diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile
new file mode 100644
index 00000000000..8d020f16c97
--- /dev/null
+++ b/.devops/cpu.Dockerfile
@@ -0,0 +1,81 @@
+ARG UBUNTU_VERSION=22.04
+# Build stage: compiles all targets, then stages the shared libraries in /app/lib and the binaries, scripts and requirements in /app/full.
+FROM ubuntu:${UBUNTU_VERSION} AS build
+
+RUN apt-get update \
+    && apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON \
+    && cmake --build build -j $(nproc)
+
+RUN mkdir -p /app/lib \
+    && find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image: runtime layer shared by the full/light/server targets (libgomp1, curl, backend .so files). Uses apt-get because apt has no stable CLI for scripts.
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full image: every built binary plus the Python scripts, gguf-py and requirements; entrypoint is tools.sh. pip runs with --no-cache-dir so its download cache is not baked into the layer.
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light image: the base runtime layer plus the llama-cli binary only; llama-cli is the entrypoint
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server image: the base runtime layer plus llama-server only. LLAMA_ARG_HOST is 0.0.0.0 so the server can receive requests from the host machine; the health check curls /health on port 8080.
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
new file mode 100644
index 00000000000..974dd78a8b0
--- /dev/null
+++ b/.devops/cuda.Dockerfile
@@ -0,0 +1,94 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for; "default" passes no -DCMAKE_CUDA_ARCHITECTURES to CMake
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image: CUDA runtime layer shared by the full/light/server targets (libgomp1, curl, built .so files). Uses apt-get because apt has no stable CLI for scripts.
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full image: every built binary plus the Python scripts, gguf-py and requirements; entrypoint is tools.sh. pip runs with --no-cache-dir so its download cache is not baked into the layer.
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light image: the base runtime layer plus the llama-cli binary only; llama-cli is the entrypoint
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server image: the base runtime layer plus llama-server only. LLAMA_ARG_HOST is 0.0.0.0 so the server can receive requests from the host machine; the health check curls /health on port 8080.
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
deleted file mode 100644
index 05bff1bdf6a..00000000000
--- a/.devops/full-cuda.Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc) && \
-    cp build/bin/* .
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
deleted file mode 100644
index 3193fea1e9a..00000000000
--- a/.devops/full-musa.Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc) && \
-    cp build/bin/* .
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
deleted file mode 100644
index df496bcd2b7..00000000000
--- a/.devops/full-rocm.Dockerfile
+++ /dev/null
@@ -1,50 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102"
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
-RUN make -j$(nproc)
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
deleted file mode 100644
index d93c0be6a70..00000000000
--- a/.devops/full.Dockerfile
+++ /dev/null
@@ -1,38 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build -j $(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib/ \;
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-COPY requirements.txt   /app/requirements.txt
-COPY requirements       /app/requirements
-COPY .devops/tools.sh   /app/tools.sh
-
-RUN pip install --upgrade pip setuptools wheel && \
-    pip install -r /app/requirements.txt
-
-COPY --from=build /app/build/bin/ /app/
-COPY --from=build /app/lib/ /app/
-COPY --from=build /app/convert_hf_to_gguf.py /app/
-COPY --from=build /app/gguf-py /app/gguf-py
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT ["/app/tools.sh"]
diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
new file mode 100644
index 00000000000..af783f5e998
--- /dev/null
+++ b/.devops/intel.Dockerfile
@@ -0,0 +1,91 @@
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+
+## Build image: configures CMake with GGML_SYCL=ON and the icx/icpx compilers; build with GGML_SYCL_F16=ON to also pass -DGGML_SYCL_F16=ON
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with dynamic libs" && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
+# Runtime packages shared by the full/light/server targets. Uses apt-get because apt has no stable CLI for scripts.
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+### Full image: built .so files, every built binary, the Python scripts, gguf-py and requirements; entrypoint is tools.sh. pip runs with --no-cache-dir so its download cache is not baked into the layer.
+FROM base AS full
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light image: the base layer plus the built .so files and the llama-cli binary only; llama-cli is the entrypoint
+FROM base AS light
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server image: the base layer plus the built .so files and llama-server only. LLAMA_ARG_HOST is 0.0.0.0 so the server can receive requests from the host machine; the health check curls /health on port 8080.
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
+
diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile
deleted file mode 100644
index 7796891d5b5..00000000000
--- a/.devops/llama-cli-cuda.Dockerfile
+++ /dev/null
@@ -1,38 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libgomp1
-
-COPY --from=build /app/lib/ /
-COPY --from=build /app/build/bin/llama-cli /
-
-ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
deleted file mode 100644
index 0706f732a98..00000000000
--- a/.devops/llama-cli-intel.Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with static libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
-    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-    cmake --build build --config Release --target llama-cli
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
deleted file mode 100644
index e7c75af20e2..00000000000
--- a/.devops/llama-cli-musa.Dockerfile
+++ /dev/null
@@ -1,38 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the MUSA runtime image
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libgomp1
-
-COPY --from=build /app/lib/ /
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile
deleted file mode 100644
index e60c747bdbf..00000000000
--- a/.devops/llama-cli-rocm.Dockerfile
+++ /dev/null
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102"
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make -j$(nproc) llama-cli
-
-ENTRYPOINT [ "/app/llama-cli" ]
diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile
deleted file mode 100644
index 92a6e047934..00000000000
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget libgomp1
-
-# Install Vulkan SDK
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk
-
-# Build it
-WORKDIR /app
-COPY . .
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
-    cmake --build build --config Release --target llama-cli
-
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
-    rm -rf /app
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile
deleted file mode 100644
index be234d55dce..00000000000
--- a/.devops/llama-cli.Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build -j $(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib/ \;
-
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/build/bin/llama-cli /app/
-COPY --from=build /app/lib/ /app/
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/app/llama-cli" ]
diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile
deleted file mode 100644
index bf8a198f99f..00000000000
--- a/.devops/llama-server-cuda.Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default CUDA archs if not specified
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/lib/ /
-COPY --from=build /app/build/bin/llama-server /llama-server
-
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
deleted file mode 100644
index b503b8cfe10..00000000000
--- a/.devops/llama-server-intel.Dockerfile
+++ /dev/null
@@ -1,34 +0,0 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target llama-server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
-
-COPY --from=build /app/build/bin/llama-server /llama-server
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
deleted file mode 100644
index cebe51d42fa..00000000000
--- a/.devops/llama-server-musa.Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the MUSA runtime image
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/lib/ /
-COPY --from=build /app/build/bin/llama-server /llama-server
-
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile
deleted file mode 100644
index 8553af75b61..00000000000
--- a/.devops/llama-server-rocm.Dockerfile
+++ /dev/null
@@ -1,54 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102"
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
-
-RUN make -j$(nproc) llama-server
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile
deleted file mode 100644
index 6aa7867791a..00000000000
--- a/.devops/llama-server-vulkan.Dockerfile
+++ /dev/null
@@ -1,31 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
-
-# Build it
-WORKDIR /app
-COPY . .
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target llama-server
-
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
-    rm -rf /app
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/llama-server" ]
diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile
deleted file mode 100644
index 72ccde2feae..00000000000
--- a/.devops/llama-server.Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
-    cmake --build build -j $(nproc) && \
-    mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib/ \;
-
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-
-COPY --from=build /app/build/bin/llama-server /app/
-COPY --from=build /app/lib/ /app/
-
-ENV LC_ALL=C.utf8
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
new file mode 100644
index 00000000000..bfd7fc1c174
--- /dev/null
+++ b/.devops/musa.Dockerfile
@@ -0,0 +1,102 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for; "default" passes no -DMUSA_ARCHITECTURES to CMake
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y \
+    build-essential \
+    cmake \
+    python3 \
+    python3-pip \
+    git \
+    libcurl4-openssl-dev \
+    libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image: MUSA runtime layer shared by the full/light/server targets (libgomp1, curl, built .so files). Uses apt-get because apt has no stable CLI for scripts.
+FROM ${BASE_MUSA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full image: every built binary plus the Python scripts, gguf-py and requirements; entrypoint is tools.sh. pip runs with --no-cache-dir so its download cache is not baked into the layer.
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light image: the base runtime layer plus the llama-cli binary only; llama-cli is the entrypoint
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server image: the base runtime layer plus llama-server only. LLAMA_ARG_HOST is 0.0.0.0 so the server can receive requests from the host machine; the health check curls /health on port 8080.
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index b88e6ca809f..043c4364b95 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -31,6 +31,7 @@
   # Increases the runtime closure size by ~700M
   useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ]
     ++ optionals useRocm [
       (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
new file mode 100644
index 00000000000..a8088ea00da
--- /dev/null
+++ b/.devops/rocm.Dockerfile
@@ -0,0 +1,113 @@
+ARG UBUNTU_VERSION=24.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=6.3
+ARG AMDGPU_VERSION=6.3
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+### Build image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+# gfx803, gfx900, gfx1032, gfx1101 and gfx1102 are not officially supported.
+# gfx906 is deprecated.
+# See https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
+
+#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
+ARG ROCM_DOCKER_ARCH=gfx1100
+
+# Set the AMD GPU target architectures
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+# ENV CC=/opt/rocm/llvm/bin/clang
+# ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN apt-get update \
+    && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    libcurl4-openssl-dev \
+    curl \
+    libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
+    && cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib \
+    && find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3-pip \
+    python3 \
+    python3-wheel\
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 24dcfd35079..9a86e6ea018 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,11 +8,11 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 # no exec here: it would replace the shell and end the loop after the first model
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
new file mode 100644
index 00000000000..cfc2162e38b
--- /dev/null
+++ b/.devops/vulkan.Dockerfile
@@ -0,0 +1,88 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK and cURL
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+
+# Build it
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image (ships libvulkan1, the Vulkan loader the Vulkan-enabled binaries need at run time)
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl libvulkan1 \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 886d33d2d56..a377eff38fb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -317,7 +317,7 @@ jobs:
           wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
           sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
           sudo apt-get update -y
-          sudo apt-get install -y build-essential vulkan-sdk
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
 
       - name: Build
         id: cmake_build
@@ -327,6 +327,12 @@ jobs:
           cmake -DGGML_VULKAN=ON ..
           cmake --build . --config Release -j $(nproc)
 
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.0.2
@@ -662,6 +668,8 @@ jobs:
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -703,6 +711,28 @@ jobs:
         run: |
           choco install ninja
 
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -732,7 +762,7 @@ jobs:
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index bc2e5020de2..41f1a89eebb 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -34,21 +34,14 @@ jobs:
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+          # Multi-stage build
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
@@ -56,10 +49,10 @@ jobs:
           fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
 
       - name: Log in to Docker Hub
         uses: docker/login-action@v2
@@ -79,25 +72,34 @@ jobs:
 
           # determine tag name postfix (build number, commit hash)
           if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="b${BUILD_NUMBER}"
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
           else
             SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
           fi
-
           # list all tags possible
-          TAGS=""
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
-
-          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
-          echo "output_tags=$TAGS"  # print out for debugging
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+              TYPE=""
+          else
+              TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS"  # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
+          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
         env:
           GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
           GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
 
       # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.freediskspace == true }} # key must match the `freediskspace` field of the matrix config entries
         uses: jlumbroso/free-disk-space@main
         with:
           # this might remove tools that are actually needed,
@@ -113,13 +115,59 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Build and push Docker image (tagged + versioned)
-        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.full_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
         uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
           # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.output_tags }}
+          tags: ${{ steps.tag.outputs.server_output_tags }}
           file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 9e66fb68cff..671fe595cdf 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -79,7 +79,7 @@ jobs:
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
         with:
-          node-version: 22
+          node-version: '22.11.0'
 
       - name: Verify bundled index.html
         id: verify_server_index_html
diff --git a/CODEOWNERS b/CODEOWNERS
index 88ab6de4fb6..adeba539583 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson
diff --git a/Makefile b/Makefile
index bcea450e46b..19ae0d5f1c8 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
+	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
diff --git a/README.md b/README.md
index 6fdd8d9eefb..d6d1958c8fc 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 
 #### Multimodal
 
@@ -110,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
 </details>
 
@@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 
@@ -412,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
-## [`llama-bench`](example/bench)
+## [`llama-bench`](examples/llama-bench)
 
 #### Benchmark the performance of the inference for various parameters.
 
@@ -433,6 +435,20 @@ To learn more about model quantization, [read this documentation](examples/quant
 
     </details>
 
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
+
+    ```bash
+    llama-run granite-code
+    ```
+
+    </details>
+
+[^3]: [RamaLama](https://github.com/containers/ramalama)
 
 ## [`llama-simple`](examples/simple)
 
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 89862fe1139..df1cdf9a59a 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
diff --git a/common/arg.cpp b/common/arg.cpp
index 49af3168251..deb11378657 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -119,32 +119,65 @@ std::string common_arg::to_string() {
 // utils
 //
 
-static void common_params_handle_model_default(common_params & params) {
-    if (!params.hf_repo.empty()) {
+static void common_params_handle_model_default(
+        std::string & model,
+        std::string & model_url,
+        std::string & hf_repo,
+        std::string & hf_file) {
+    if (!hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
-        if (params.hf_file.empty()) {
-            if (params.model.empty()) {
+        if (hf_file.empty()) {
+            if (model.empty()) {
                 throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
             }
-            params.hf_file = params.model;
-        } else if (params.model.empty()) {
+            hf_file = model;
+        } else if (model.empty()) {
             // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = params.hf_repo + "_" + params.hf_file;
+            std::string filename = hf_repo + "_" + hf_file;
             // to make sure we don't have any slashes in the filename
             string_replace_all(filename, "/", "_");
-            params.model = fs_get_cache_file(filename);
+            model = fs_get_cache_file(filename);
         }
-    } else if (!params.model_url.empty()) {
-        if (params.model.empty()) {
-            auto f = string_split<std::string>(params.model_url, '#').front();
+    } else if (!model_url.empty()) {
+        if (model.empty()) {
+            auto f = string_split<std::string>(model_url, '#').front();
             f = string_split<std::string>(f, '?').front();
-            params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
-    } else if (params.model.empty()) {
-        params.model = DEFAULT_MODEL_PATH;
+    } else if (model.empty()) {
+        model = DEFAULT_MODEL_PATH;
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = { // types accepted by --cache-type-k / --cache-type-v
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) { // look up a KV cache type by its name
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) { // compare against the name reported by ggml_type_name()
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s); // s matches no entry of kv_cache_types
+}
+
+static std::string get_all_kv_cache_types() { // ", "-separated names of every entry in kv_cache_types
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); // no separator after the last entry
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -247,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    common_params_handle_model_default(params);
+    // TODO: refactor model params in a common struct
+    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file);
+    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -591,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -813,7 +848,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq"}, "SEQUENCE",
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -826,13 +861,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.ignore_eos = true;
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
-        [](common_params & params) {
-            params.sampling.penalize_nl = true;
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
         string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -887,6 +915,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--repeat-last-n"}, "N",
         string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
             params.sampling.penalty_last_n = value;
             params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
@@ -941,6 +972,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--dry-penalty-last-n"}, "N",
         string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
             params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
@@ -1174,18 +1208,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -1543,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
+    add_opt(common_arg(
+        {"-hfrv", "--hf-repo-v"}, "REPO",
+        "Hugging Face model repository for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO_V"));
+    add_opt(common_arg(
+        {"-hffv", "--hf-file-v"}, "FILE",
+        "Hugging Face model file for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE_V"));
     add_opt(common_arg(
         {"-hft", "--hf-token"}, "TOKEN",
         "Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -2083,35 +2141,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2189,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+
+    add_opt(common_arg(
+        {"-mv", "--model-vocoder"}, "FNAME",
+        "vocoder model for audio generation (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+
+    // model-specific
+    add_opt(common_arg(
+        {"--tts-oute-default"},
+        string_format("use default OuteTTS models (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     return ctx_arg;
 }
diff --git a/common/common.cpp b/common/common.cpp
index b1355b835dc..f40e3a289fb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1015,38 +1034,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
@@ -1081,8 +1068,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type  = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }
@@ -1108,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1138,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1211,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1799,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
diff --git a/common/common.h b/common/common.h
index 86e9def47ba..76f2aed851f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
 
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
@@ -80,6 +80,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -95,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -130,7 +132,6 @@ struct common_params_sampling {
     int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau       = 5.00f; // target entropy
     float   mirostat_eta       = 0.10f; // learning rate
-    bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
@@ -139,6 +140,7 @@ struct common_params_sampling {
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -158,6 +160,7 @@ struct common_params_sampling {
 
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -171,6 +174,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding                          // NOLINT
 };
 
+struct common_params_vocoder { // where to find the vocoder model: Hugging Face repo/file, local path, or download URL
+    std::string hf_repo = ""; // Hugging Face repository of the vocoder model               // NOLINT
+    std::string hf_file = ""; // Hugging Face file name of the vocoder model                // NOLINT
+
+    std::string model     = ""; // vocoder model path (-mv, --model-vocoder)                // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+};
+
 struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =  4096; // context size
@@ -193,11 +204,13 @@ struct common_params {
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
-    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -211,8 +224,9 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
 
     std::string model                = ""; // model path                                                    // NOLINT
     std::string model_alias          = ""; // model alias                                                   // NOLINT
@@ -286,8 +300,8 @@ struct common_params {
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
@@ -437,6 +451,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
     return parts;
 }
 
+static bool string_starts_with(const std::string & str,      // returns true when str begins with prefix
+                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0; // rfind with start position 0 can only match at the very beginning of str
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -588,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: replace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index bb09be4bbca..6328624e63c 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -183,21 +183,21 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     params.penalty_present,
                     params.penalize_nl,
                     params.ignore_eos));
-
+  
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
+                case COMMON_SAMPLER_TYPE_DRY:
                     {
-                        std::vector<const char*> c_breakers;
+                        std::vector<const char *> c_breakers;
                         c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto& str : params.dry_sequence_breakers) {
+                        for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }
 
                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
-                        break;
+                    break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                     break;
@@ -219,6 +219,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
                     break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
@@ -447,6 +450,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
         default : return '?';
     }
 }
@@ -461,6 +465,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
         default : return "";
     }
 }
@@ -475,6 +480,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     // since samplers names are written multiple ways
@@ -521,6 +527,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     std::vector<common_sampler_type> samplers;
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c63d929c187..b6c15da94ec 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -221,17 +221,17 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
 
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        logger.info(f"gguf: embedding length = {n_embd}")
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
 
         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-        logger.info(f"gguf: head count = {n_head}")
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
 
         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -296,7 +296,9 @@ def prepare_tensors(self):
                     break
 
             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
-                data = data_torch.squeeze().numpy()
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()
 
                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
@@ -324,6 +326,8 @@ def prepare_tensors(self):
                             gguf.MODEL_TENSOR.TIME_MIX_W2,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -525,9 +529,19 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -571,6 +585,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
@@ -664,6 +681,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
 
         if res is None:
             logger.warning("\n")
@@ -686,6 +709,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
         # Marker: End get_vocab_base_pre
 
+    def _set_vocab_none(self) -> None:  # registers "none" as the tokenizer model with the GGUF writer
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1669,6 +1695,184 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code: truncate 2/3 * ffn_mult * n_embd to an int, then round up to a multiple of 256
+        truncated = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(truncated, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code: round n up to the nearest multiple of k
+        # (n itself is returned when it is already a multiple of k)
+        remainder = n % k
+        return n if remainder == 0 else n + k - remainder
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = []
+            self._num_heads = []
+            _ffn_multipliers = []
+            # Per-layer head counts, decided by each block's "attention" entry:
+            #   normal attention (n_heads_in_group is not None):
+            #     kv heads = num_attention_heads // n_heads_in_group, heads = num_attention_heads
+            #   linear attention (n_heads_in_group is None, replace_with_linear is True):
+            #     kv heads = 0, heads = num_attention_heads
+            #   attention-free (n_heads_in_group is None, replace_with_linear is not True):
+            #     kv heads = 0, heads = 0
+            for block_cfg in _block_configs:
+                attn_cfg = block_cfg["attention"]
+                group_size = attn_cfg["n_heads_in_group"]
+                if group_size is not None:
+                    heads = self.hparams["num_attention_heads"]
+                    kv_heads = heads // group_size
+                elif attn_cfg["replace_with_linear"] is True:
+                    heads = self.hparams["num_attention_heads"]
+                    kv_heads = 0
+                else:
+                    heads = kv_heads = 0
+                self._num_kv_heads.append(kv_heads)
+                self._num_heads.append(heads)
+                _ffn_multipliers.append(block_cfg["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            # one feed-forward width per block, derived from that block's ffn_mult
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(ffn_mult, self.hparams["hidden_size"])
+                for ffn_mult in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) != 128256:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+            return
+
+        # vocab_size is 128256 (also the default when the key is absent): write a "gpt2" tokenizer
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(
+            self.dir_model, load_merges=True,
+            special_token_types=['bos', 'eos', 'eom', 'eot'],
+        )
+        # the special token ids are set explicitly
+        for token_type, token_id in (("bos", 128000), ("eos", 128001), ("eom", 128008), ("eot", 128009)):
+            special_vocab._set_special_token(token_type, token_id)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        if "block_configs" in hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+            head_size = hparams["hidden_size"] // hparams["num_attention_heads"]  # key and value share this length
+            self.gguf_writer.add_key_length(head_size)
+            self.gguf_writer.add_value_length(head_size)
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        # rope dimension: head_dim when the config has it, else hidden_size // num_attention_heads
+        rope_dim = hparams["head_dim"] if "head_dim" in hparams else hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        # rope scaling is only written for the "linear" type, and only when a factor is present
+        rope_scaling = hparams.get("rope_scaling")
+        if rope_scaling is not None and "factor" in rope_scaling and rope_scaling.get("type") == "linear":
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        # the group count is n_head_kv when it is given and differs from n_head, otherwise n_head
+        groups = n_head if n_head_kv is None or n_head == n_head_kv else n_head_kv
+        half_rows = weights.shape[0] // groups // 2
+        split = weights.reshape(groups, 2, half_rows, *weights.shape[1:])
+        return split.swapaxes(1, 2).reshape(weights.shape)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # defaults: the model-wide head counts
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        # per-layer overrides, only applicable to tensors that carry a block id
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        elif name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # Yields the ROPE_FREQS factor tensor; nothing is yielded unless rope_type is "llama3".
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                # HF llama3 configs keep the original context length inside rope_scaling; the top-level key and 8192 remain as fallbacks
+                top_level_ctx = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_scaling.get("original_max_position_embeddings", top_level_ctx)
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)  # short wavelengths are left unscaled
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)  # long wavelengths get the full factor
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()  # no DeciLM-specific step: defers entirely to the base implementation
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -2001,6 +2205,67 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
 
 
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # zero-pad a copy to at least 4 sections; the list stored in hparams stays untouched
+        mrope_section = list(self.hparams["rope_scaling"]["mrope_section"])
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:  # fall back to the GPT-2 style vocab writer
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if not name.startswith("visual."):  # tensors under "visual." are not yielded
+                yield name, data
+
+
+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # tensors with these codebook.* suffixes are skipped instead of being written
+        skipped_suffixes = ("codebook.cluster_size", "codebook.embed_avg", "codebook.inited")
+        if name.endswith(skipped_suffixes):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        mapped_name = self.map_tensor_name(name)
+        logger.info(f"{mapped_name} -> {data_torch.shape}")
+
+        return [(mapped_name, data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length(self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps(self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups(self.hparams["group_norm_groups"])
+        # posnet sub-config
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count(self.hparams["posnet"]["n_layer"])
+        # convnext sub-config
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count(self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
@@ -2129,6 +2394,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2245,7 +2519,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2544,7 +2822,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2610,13 +2888,73 @@ def phantom(tok):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        # drop a leading "bert." from the tensor name
+        if name.startswith("bert."):
+            name = name[len("bert."):]
+
+        # rename a trailing ".gamma" to ".weight" and a trailing ".beta" to ".bias"
+        for old_suffix, new_suffix in ((".gamma", ".weight"), (".beta", ".bias")):
+            if name.endswith(old_suffix):
+                name = name[:-len(old_suffix)] + new_suffix
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return [] # we don't need these
 
+        # the "cls.predictions" and "cls.seq_relationship" tensors
+        # produce no output either
+        dropped_prefixes = ("cls.predictions", "cls.seq_relationship")
+        if name.startswith(dropped_prefixes):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # position embeddings start at pad_token_id + 1, so that many leading rows are
+        # chopped off the position_embd matrix later; the advertised maximum position
+        # count shrinks by the same amount. Without a pad_token_id nothing is chopped.
+        pad_token_id = self.hparams.get("pad_token_id")
+        self._position_offset = None if pad_token_id is None else 1 + pad_token_id
+        if self._position_offset is not None and "max_position_embeddings" in self.hparams:
+            self.hparams["max_position_embeddings"] -= self._position_offset
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        if not (self.dir_model / "tokenizer.json").exists():
+            return super().set_vocab()
+
+        # tokenizer.json is present: BPE vocab, with the add-BOS / add-EOS flags enabled
+        self._set_vocab_gpt2()
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+        # we need this to validate the size of the token_type embeddings
+        # though currently we are passing all zeros to the token_type embeddings
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        prefix = "roberta."
+        if name.startswith(prefix):
+            name = name[len(prefix):]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight" and self._position_offset is not None:
+            data_torch = data_torch[self._position_offset:, :]
+
+        # the remaining renaming / filtering is shared with BertModel
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2936,6 +3274,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if new_name.endswith("time_mix_w2.weight"):
             data_torch = data_torch.permute(0, 2, 1)
 
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
         rescale_every_n_layers = self.hparams["rescale_every"]
         if rescale_every_n_layers > 0:
             if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
@@ -3404,6 +3745,97 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:  # fall back to the GPT-2 style vocab writer
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        rope_dim = (
+            hparams["head_dim"] if "head_dim" in hparams
+            else hparams["hidden_size"] // hparams["num_attention_heads"]
+        )
+        # rope settings, then vocab size and the expert (MoE) hyper-parameters
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None  # per-block dicts of expert tensors awaiting merge; created lazily in modify_tensors
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        # n_head_kv replaces n_head as the group count when it is given and the two differ
+        n_groups = n_head_kv if (n_head_kv is not None and n_head != n_head_kv) else n_head
+        rows_per_half = weights.shape[0] // n_groups // 2
+        grouped = weights.reshape(n_groups, 2, rows_per_half, *weights.shape[1:])
+        return grouped.swapaxes(1, 2).reshape(weights.shape)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        elif name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+        # tensors outside "mlp.experts" map one-to-one
+        if "mlp.experts" not in name:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        # process the experts separately: they are buffered per block and only
+        # emitted once the whole block has been collected
+        n_experts = self.hparams["n_routed_experts"]
+        assert bid is not None
+
+        if self._experts is None:
+            self._experts = [{} for _ in range(self.block_count)]
+
+        block_experts = self._experts[bid]
+        block_experts[name] = data_torch
+
+        # every expert contributes three tensors (down_proj, gate_proj, up_proj)
+        if len(block_experts) < n_experts * 3:
+            return []
+
+        # merge the experts into a single 3d tensor
+        tensors: list[tuple[str, Tensor]] = []
+
+        for w_name in ["down_proj", "gate_proj", "up_proj"]:
+            # pop this projection of every expert, in expert order
+            per_expert = [
+                block_experts.pop(f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight")
+                for xid in range(n_experts)
+            ]
+            stacked = torch.stack(per_expert, dim=0)
+
+            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+            tensors.append((self.map_tensor_name(merged_name), stacked))
+
+        return tensors
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # any tensor still buffered here was never merged into a stacked expert tensor
+            leftover = [key for per_block in self._experts for key in per_block]
+            if leftover:
+                raise ValueError(f"Unprocessed experts: {leftover}")
+
+
 @Model.register("DeepseekV2ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index aa10e5db796..fea23ddb4ae 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
     {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@@ -104,6 +105,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
     {"name": "minerva-7b",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
+    {"name": "gigachat",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
+    {"name": "megrez",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
 ]
 
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 9210e9fea5e..66cfab2c3b7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,12 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(gbnf-validator)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(gbnf-validator)
+    endif()
+
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -46,12 +51,17 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    add_subdirectory(tts)
+    add_subdirectory(gen-docs)
     if (NOT GGML_BACKEND_DL)
         # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(convert-llama2c-to-ggml)
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
-        add_subdirectory(quantize-stats)
+        if (NOT WIN32)
+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
+            add_subdirectory(quantize-stats)
+        endif()
         add_subdirectory(llava)
         if (GGML_RPC)
             add_subdirectory(rpc)
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index ba219cd4b32..e2e01f2d598 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -65,6 +65,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
     auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp
index 16be5ce3eec..4eeac1eeb7a 100644
--- a/examples/cvector-generator/mean.hpp
+++ b/examples/cvector-generator/mean.hpp
@@ -15,7 +15,7 @@ static void run(
     for (size_t il = 0; il < v_input.size(); ++il) {
         // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
         // calculate mean vector
         struct ggml_tensor * t_layer = v_input[il];
diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index f6e307fbc49..e88bbdde93f 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -302,7 +302,7 @@ static void run_pca(
 
         // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
         // run power_iteration
         params.i_layer = il;
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 67662313d07..058b5cc8602 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -265,8 +265,8 @@ struct lora_merge_ctx {
             fout.write((const char *)data.data(), data.size());
         }
 
-        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
+        printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -352,7 +352,7 @@ struct lora_merge_ctx {
                 const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
                 delta = ggml_scale(ctx0, delta, scale);
                 cur = ggml_add(ctx0, delta, cur);
-                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s :   + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
                 printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
             }
             cur = ggml_cast(ctx0, cur, out->type);
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index 7493af9d3ae..17a0e27c444 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -11,19 +11,15 @@
 static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
     const auto cpts = unicode_cpts_from_utf8(input_str);
 
-    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
 
     size_t pos = 0;
     for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
-
-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        llama_grammar_accept(grammar, cpt);
 
         if (stacks_cur.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-            stacks_cur = stacks_prev;
             return false;
         }
         ++pos;
@@ -82,7 +78,8 @@ int main(int argc, char** argv) {
 
     llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
     if (grammar == nullptr) {
-        throw std::runtime_error("Failed to initialize llama_grammar");
+        fprintf(stdout, "Failed to initialize llama_grammar\n");
+        return 1;
     }
     // Read the input file
     std::string input_str;
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index 7e62657e118..75f63f93891 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -287,7 +287,7 @@ struct split_strategy {
     }
 
     void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
+        printf("n_split: %zu\n", ctx_outs.size());
         int i_split = 0;
         for (auto & ctx_out : ctx_outs) {
             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 6e42fa0734e..18a945b3390 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         }
 
         std::vector<float> emb_norm(emb_unorm.size());
-        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
         result.push_back(emb_norm);
 
 #ifdef GRIT_DEBUG
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index bac606f4716..2338ad1067d 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
             test_gen(ctx, 1, t.n_threads);
         }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_gen(ctx, t.n_gen, t.n_threads);
diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts
index 2d1dfba2040..28dbc190488 100644
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -19,6 +19,7 @@ android {
         externalNativeBuild {
             cmake {
                 arguments += "-DLLAMA_BUILD_COMMON=ON"
+                arguments += "-DGGML_LLAMAFILE=OFF"
                 arguments += "-DCMAKE_BUILD_TYPE=Release"
                 cppFlags += listOf()
                 arguments += listOf()
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 5d32f377f26..3ce0d60c80d 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-qwen2vl-cli)
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d7c94352b56..3cd0d2fa859 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
+//#ifdef GGML_USE_CUDA
+//#include "ggml-cuda.h"
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//#include "ggml-sycl.h"
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//#include "ggml-metal.h"
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//#include "ggml-cann.h"
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//#include "ggml-vulkan.h"
+//#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
+#define KEY_USE_SILU            "clip.use_silu"
 #define KEY_N_EMBD              "clip.%s.embedding_length"
 #define KEY_N_FF                "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK             "clip.%s.block_count"
@@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD      "%s.token_embd.weight"
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
-#define TN_PATCH_EMBD      "v.patch_embd.weight"
+#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not renamed with a ".0" suffix, for backward compatibility
+#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
@@ -163,6 +166,7 @@ enum projector_type {
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -171,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };
 
 
@@ -463,7 +468,8 @@ struct clip_vision_model {
 
     // embeddings
     struct ggml_tensor * class_embedding;
-    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_embeddings_0;
+    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when the Conv3D is decoupled along the temporal dimension (Qwen2VL)
     struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
@@ -553,6 +559,7 @@ struct clip_ctx {
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_qwen2vl_merger = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -561,6 +568,7 @@ struct clip_ctx {
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
+    bool use_silu = false;
     int32_t ftype = 1;
 
     bool has_class_embedding = true;
@@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             image_size_height = imgs->data->ny;
         }
     }
+    else if (ctx->has_qwen2vl_merger) {
+        // use the image's native resolution when an image is available
+        if (is_inf) {
+        // if (imgs->data->nx && imgs->data->ny) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size           = hparams.patch_size;
     const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int patches_w            = image_size_width / patch_size;
+    const int patches_h            = image_size_height / patch_size;
     const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_position_ids     = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
     int n_layer                    = hparams.n_layer;
     const float eps                = hparams.eps;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
     const int batch_size = imgs->size;
 
@@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    if (ctx->has_qwen2vl_merger) {
+        GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, patches_h, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+        inp = ggml_reshape_3d(
+            ctx0, inp,
+            hidden_size, patches_w * patches_h, batch_size);
+    }
+    else {
+        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    }
 
     if (ctx->has_patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-    embeddings =
-        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    if (!ctx->has_qwen2vl_merger) { // qwen2vl uses rope position embedding
+        embeddings =
+            ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    }
 
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
@@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
+        // TODO: figure out why we are doing it this way ???
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             struct ggml_tensor * Q =
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
-            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                Q = ggml_rope_multi(
+                    ctx0, Q, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
+            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
             Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
 
@@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
 
             K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                K = ggml_rope_multi(
+                    ctx0, K, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
             K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
 
@@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         if (ctx->use_gelu) {
             cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
@@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
+
     }
 
     // post-layernorm
@@ -840,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                 // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
                 // layer norm
                 // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -888,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // block_2
             {
                 // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
                 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                 // layer norm
@@ -949,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // mlp_2 ne [24, 24, 2048, 1]
             mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
             // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
             peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
             peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
             mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1030,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ASSERT(false);
         }
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -1153,30 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-#ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-#endif
+//#ifdef GGML_USE_CUDA
+//    new_clip->backend = ggml_backend_cuda_init(0);
+//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//    new_clip->backend = ggml_backend_metal_init();
+//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//    new_clip->backend = ggml_backend_cann_init(0);
+//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//    new_clip->backend = ggml_backend_vk_init(0);
+//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//    new_clip->backend = ggml_backend_sycl_init(0);
+//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+//#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
         }
 
+        idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
+        if (idx != -1) {
+            new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
+        }
         // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
 
         GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         idx = get_key_idx(ctx, KEY_USE_GELU);
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
 
+        try {
+            idx = get_key_idx(ctx, KEY_USE_SILU);
+            new_clip->use_silu = gguf_get_val_bool(ctx, idx);
+        } catch (std::runtime_error & /*e*/) {
+            new_clip->use_silu = false;
+        }
+
         if (verbosity >= 1) {
             LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
             LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
 
         try {
-            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.patch_embeddings_0    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
         } catch(const std::exception& /*e*/) {
             LOG_ERR("%s: failed to load vision model tensors\n", __func__);
         }
+        try {
+            vision_model.patch_embeddings_1    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
+        } catch(const std::exception& /*e*/) {
+            new_clip->has_qwen2vl_merger = false;
+        }
 
         // LLaVA projection
         if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
             vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
         }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
+            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        }
         else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
+        batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
     ctx_clip->load_image_size = load_image_size;
 }
 
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { // getter counterpart of clip_add_load_image_size
+    return ctx_clip->load_image_size; // hands back the stored pointer itself, not a copy
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
@@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         }
         return true;
     }
+    else if (ctx->has_qwen2vl_merger) {
+        clip_image_u8 * resized = clip_image_u8_init();
+        auto patch_size = clip_patch_size(ctx) * 2;
+        int nx = ceil((float)img->nx / patch_size) * patch_size;
+        int ny = ceil((float)img->ny / patch_size) * patch_size;
+        bicubic_resize(*img, *resized, nx, ny);
+
+        res_imgs->data = new clip_image_f32[1];
+        // clip_image_f32 * res = clip_image_f32_init();
+        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+        // res_imgs->data[0] = *res;
+        res_imgs->size = 1;
+
+        // clip_image_f32_free(res);
+        clip_image_u8_free(resized);
+        return true;
+    }
 
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
@@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { // byte size of the embedding for an image of the given size; note height comes before width
+    clip_image_f32 img; // scratch image: only nx/ny are assigned here, no pixel data is filled in
+    img.nx = img_w;
+    img.ny = img_h;
+    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); // patch count * clip_n_mmproj_embd * sizeof(float)
+}
+
 int32_t clip_image_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_size;
 }
@@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
+    clip_image_f32 img;
+    img.nx = ctx->vision_model.hparams.image_size;
+    img.ny = ctx->vision_model.hparams.image_size;
+    return clip_n_patches_by_img(ctx, &img);
+}
+
+int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+        n_patches = x_patch * y_patch;
     }
 
     return n_patches;
@@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
         image_size_width  = imgs->data[0].nx;
         image_size_height = imgs->data[0].ny;
     }
@@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            if (!ctx->has_minicpmv_projector) {
+            if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
                 GGML_ASSERT(nx == image_size && ny == image_size);
             }
 
@@ -2413,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
-            for(int i=0;i<pos_w * pos_h;++i){
-                for(int j=0;j<embed_dim;++j){
-                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+            for(int i=0;i < pos_w * pos_h; ++i){
+                for(int j=0; j < embed_dim; ++j){
+                    pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
                 }
             }
 
@@ -2435,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
         }
 
-        {
+        if (ctx->has_qwen2vl_merger) {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            const int pw = image_size_width / patch_size;
+            const int ph = image_size_height / patch_size;
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+            int ptr = 0;
+            for (int y = 0; y < ph; y+=2)
+            {
+                for (int x = 0; x < pw; x+=2)
+                {
+                    for (int dy = 0; dy < 2; dy++) {
+                        for (int dx = 0; dx < 2; dx++) {
+                            positions_data[ptr]                 = y + dy;
+                            positions_data[num_patches + ptr]     = x + dx;
+                            positions_data[num_patches * 2 + ptr] = y + dy;
+                            positions_data[num_patches * 3 + ptr] = x + dx;
+                            ptr++;
+                        }
+                    }
+                }
+            }
+
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+        else {
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
 
             int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -2444,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
-        }
 
-        {
-            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-            int* patches_data = (int*)malloc(ggml_nbytes(patches));
-            for (int i = 0; i < num_patches; i++) {
-                patches_data[i] = i + 1;
+            {
+                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                int* patches_data = (int*)malloc(ggml_nbytes(patches));
+                for (int i = 0; i < num_patches; i++) {
+                    patches_data[i] = i + 1;
+                }
+                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+                free(patches_data);
             }
-            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-            free(patches_data);
         }
     }
 
@@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return 3584;
         }
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        return ctx->vision_model.mm_1_b->ne[0];
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
     }
     return 0;
 }
+
+bool clip_is_qwen2vl(const struct clip_ctx * ctx) { // reports the context's has_qwen2vl_merger flag
+    return ctx->has_qwen2vl_merger;
+}
+
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+    // Copy the caller's h*w*3 floats into a clip_image_f32, then encode it into vec.
+    clip_image_f32 clip_img;
+    clip_img.buf.resize(h * w * 3);
+    for (int i = 0; i < h*w*3; i++) {
+        clip_img.buf[i] = img[i];
+    }
+    clip_img.nx = w;
+    clip_img.ny = h;
+    // Propagate the encoder's result so callers can detect a failed encode.
+    return clip_image_encode(ctx, n_threads, &clip_img, vec);
+}
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 78588bdf174..1603edd265e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
 
 CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
 
 CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
@@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+
+CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
 #ifdef __cplusplus
 }
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b883..16f30c56cea 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    if (clip_is_minicpmv(ctx_clip)) {
+    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
         std::vector<float *> image_embd_v;
         image_embd_v.resize(img_res_v.size);
         struct clip_image_size * load_image_size = clip_image_size_init();
+
         for (size_t i = 0; i < img_res_v.size; i++) {
             const int64_t t_img_enc_step_start_us = ggml_time_us();
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
             int patch_size=14;
             load_image_size->width = img_res_v.data[i].nx;
             load_image_size->height = img_res_v.data[i].ny;
             clip_add_load_image_size(ctx_clip, load_image_size);
+
             bool encoded = false;
-            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-            if (has_minicpmv_projector == 2) {
-                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-            }
-            else if (has_minicpmv_projector == 3) {
+            if (clip_is_qwen2vl(ctx_clip)) {
                 encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
             }
+            else {
+                int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+                if (has_minicpmv_projector == 2) {
+                    encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+                }
+                else if (has_minicpmv_projector == 3) {
+                    encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+                }
+            }
+
             if (!encoded) {
                 LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
@@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
         int n_img_pos_out = 0;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
-            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
-            n_img_pos_out += clip_n_patches(ctx_clip);
+            std::memcpy(
+                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
+                image_embd_v[i],
+                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     if (clip_is_minicpmv(ctx_clip)) {
         num_max_patches = 10;
     }
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
+    float * image_embd;
+    if (clip_is_qwen2vl(ctx_clip)) {
+        // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
+        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
+    } else {
+        image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
+    }
     if (!image_embd) {
         LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
new file mode 100644
index 00000000000..c87606b4fdf
--- /dev/null
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -0,0 +1,165 @@
+import argparse
+from typing import Dict
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import (
+    Qwen2VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    AutoProcessor,
+    Qwen2VLConfig
+)
+
+
+VISION = "clip.vision"  # value substituted for `{arch}` when KEY_* templates are expanded through k()
+
+
+def k(raw_key: str, arch: str) -> str:  # expand the `{arch}` placeholder of a key template
+    return raw_key.format(arch=arch)
+
+
+def to_gguf_name(name: str) -> str:
+    og = name
+    name = name.replace("text_model", "t").replace("vision_model", "v")
+    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    name = name.replace("attn.", "attn_")
+    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("merger.mlp", 'mm')
+    print(f"[to_gguf_name] {og} --> {name}")
+    return name
+
+
+def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
+    vision_model = qwen2vl.visual
+    tensor_map = {}
+    for name, ten in vision_model.state_dict().items():
+        ten = ten.numpy()
+        if 'qkv' in name:
+            if ten.ndim == 2: # weight
+                c3, _ = ten.shape
+            else:             # bias
+                c3 = ten.shape[0]
+            assert c3 % 3 == 0
+            c = c3 // 3
+            wq = ten[:c]
+            wk = ten[c: c * 2]
+            wv = ten[c * 2:]
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+        elif 'merger' in name:
+            if name.endswith("ln_q.weight"):
+                tensor_map['v.post_ln.weight'] = ten
+            elif name.endswith("ln_q.bias"):
+                tensor_map['v.post_ln.bias'] = ten
+            else:
+                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                tensor_map[to_gguf_name(name)] = ten
+        elif 'patch_embed.proj.weight' in name:
+            # NOTE: split Conv3D into Conv2Ds
+            c1, c2, kt, kh, kw = ten.shape
+            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+        else:
+            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+
+    for new_name, ten in tensor_map.items():
+        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+            tensor_map[new_name] = ten.astype(np.float32)
+        else:
+            tensor_map[new_name] = ten.astype(dtype)
+    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
+    return tensor_map
+
+
+def main(args):
+    if args.data_type == 'fp32':
+        dtype = torch.float32
+        np_dtype = np.float32
+        ftype = 0
+    elif args.data_type == 'fp16':
+        dtype = torch.float32  # the checkpoint is still loaded as fp32; only the exported arrays use np_dtype
+        np_dtype = np.float16
+        ftype = 1
+    else:
+        raise ValueError()
+
+    local_model = False
+    model_path = ""
+    model_name = args.model_name
+    print("model_name: ", model_name)
+    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_name, torch_dtype=dtype, device_map="cpu"
+    )
+    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+    vcfg = cfg.vision_config
+
+    if os.path.isdir(model_name):  # NOTE(review): `os` is not imported by name in this file; presumably it arrives via `from gguf import *` -- confirm
+        local_model = True
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_path = model_name
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"  # output file is created relative to the current directory
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_description("image encoder for Qwen2VL")
+
+    fout.add_file_type(ftype)
+    fout.add_bool("clip.has_text_encoder", False)
+    fout.add_bool("clip.has_vision_encoder", True)
+    fout.add_bool("clip.has_qwen2vl_merger", True)
+    fout.add_string("clip.projector_type", "qwen2vl_merger")
+
+    print(cfg.vision_config)
+    if 'silu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", True)
+        fout.add_bool("clip.use_gelu", False)
+    elif 'gelu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", False)
+        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())  # a "quick" gelu variant sets neither flag
+    else:
+        raise ValueError()
+
+    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    for name, data in tensor_map.items():
+        fout.add_tensor(name, data)
+
+    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
+    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divisible by (14*2)
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
+    fout.add_name(model_name)
+    """
+    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
+            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
+    """
+
+    if local_model:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
+    else:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
+
+    fout.write_header_to_file()
+    fout.write_kv_data_to_file()
+    fout.write_tensors_to_file()
+    fout.close()
+    print("save model as: ", fname_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")  # passed to from_pretrained(); may also be a local directory
+    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")  # selects np_dtype / ftype in main()
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
new file mode 100644
index 00000000000..e86a60280ae
--- /dev/null
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -0,0 +1,581 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+#ifndef NDEBUG // same condition as the debug helpers below, which use the gallocr/backend APIs
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+
+// Feed the image embedding to the model in chunks of at most `n_batch` tokens, attaching
+// 4-section M-RoPE position ids to every image token:
+//   section 0: *st_pos_id               section 1: *st_pos_id + grid row
+//   section 2: *st_pos_id + grid column section 3: 0
+// Advances *n_past by the tokens decoded and *st_pos_id by max(grid width, grid height).
+// Returns false as soon as llama_decode() fails.
+static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
+                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
+    const int  n_embd   = llama_n_embd(llama_get_model(ctx_llama));
+    const auto n_tokens = image_embed->n_image_pos;
+
+    // one grid cell per (14 * 2) pixels in each direction, partial cells rounded up
+    const int cell   = 14 * 2;
+    const int grid_h = image_size->height / cell + (image_size->height % cell > 0);
+    const int grid_w = image_size->width  / cell + (image_size->width  % cell > 0);
+
+    // four consecutive sections of n_tokens entries each, zero-initialized
+    std::vector<llama_pos> all_pos(n_tokens * 4);
+    for (int row = 0; row < grid_h; row++) {
+        for (int col = 0; col < grid_w; col++) {
+            const int idx = row * grid_w + col;
+            all_pos[idx]                = *st_pos_id;
+            all_pos[idx + n_tokens]     = *st_pos_id + row;
+            all_pos[idx + n_tokens * 2] = *st_pos_id + col;
+            all_pos[idx + n_tokens * 3] = 0;
+        }
+    }
+    *st_pos_id += std::max(grid_w, grid_h);
+
+    // scratch buffer reused by every chunk
+    std::vector<llama_pos> chunk_pos(n_tokens * 4);
+    for (int start = 0; start < n_tokens; start += n_batch) {
+        const int n_eval = std::min<int>(n_batch, n_tokens - start);
+
+        // repack so that the chunk's four sections sit back to back, n_eval entries each
+        std::fill(chunk_pos.begin(), chunk_pos.end(), 0);
+        for (int sec = 0; sec < 4; sec++) {
+            memcpy(&chunk_pos[n_eval * sec], &all_pos[n_tokens * sec + start], n_eval * sizeof(llama_pos));
+        }
+
+        llama_batch batch = {
+            int32_t(n_eval),                       // n_tokens
+            nullptr,                               // token
+            (image_embed->embed + start * n_embd), // embed
+            chunk_pos.data(),                      // pos
+            nullptr,                               // n_seq_id
+            nullptr,                               // seq_id
+            nullptr,                               // logits
+        };
+
+        if (llama_decode(ctx_llama, batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+
+    return true;
+}
+
+
+
+// Decode `tokens` in chunks of at most `n_batch`. Each chunk gets 4-section M-RoPE position ids:
+// the first three sections count up from *st_pos_id, the fourth stays zero.
+// Advances *n_past and *st_pos_id per decoded chunk; returns false when llama_decode() fails.
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
+    const int n_total = (int) tokens.size();
+    std::vector<llama_pos> pos;
+    for (int start = 0; start < n_total; start += n_batch) {
+        const int n_eval = std::min(n_batch, n_total - start);
+        auto batch = llama_batch_get_one(&tokens[start], n_eval);
+        // TODO: add mrope pos ids somewhere else
+        pos.assign(batch.n_tokens * 4, 0);
+        for (int sec = 0; sec < 3; sec++) {
+            for (int t = 0; t < batch.n_tokens; t++) {
+                pos[sec * batch.n_tokens + t] = *st_pos_id + t;
+            }
+        }
+        batch.pos = pos.data();
+        if (llama_decode(ctx_llama, batch)) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, start, n_total, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+        *st_pos_id += n_eval;
+    }
+    return true;
+}
+
+// Decode a single token id by handing it to eval_tokens() as a one-element batch.
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
+    const std::vector<llama_token> single = { id };
+    return eval_tokens(ctx_llama, single, 1, n_past, st_pos_id);
+}
+
+// Tokenize `str` with common_tokenize() (forwarding `add_bos`) and decode the resulting tokens.
+// Returns the result of eval_tokens(), so a failed decode is no longer reported as success.
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, std::string(str), add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
+}
+
+// Sample the next token, accept it into the sampler and decode it so the context advances.
+// Returns the token's text, or "</s>" for an end-of-generation token. The pointer refers to a
+// function-local static string that the next call overwrites.
+static const char * sample(struct common_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past, int * st_pos_id) {
+    static std::string piece;
+    const llama_token tok = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, tok, true);
+    const bool is_eog = llama_token_is_eog(llama_get_model(ctx_llama), tok);
+    piece = is_eog ? std::string("</s>") : common_token_to_piece(ctx_llama, tok);
+    eval_id(ctx_llama, tok, n_past, st_pos_id);
+    return piece.c_str();
+}
+
+static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,"; // opens an inline base64 image inside the prompt
+static const char* IMG_BASE64_TAG_END = "\">"; // closes it; searched for from the opening tag onwards
+
+static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { // offsets are npos when a tag is absent
+    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
+    end_out   = prompt.find(IMG_BASE64_TAG_END, (begin_out != std::string::npos) ? begin_out : 0UL);
+}
+
+// True when the prompt contains the opening tag of an inline base64 image.
+static bool prompt_contains_image(const std::string& prompt) {
+    // only the opening tag decides the answer, so the closing-tag lookup is not needed here
+    return prompt.find(IMG_BASE64_TAG_BEGIN) != std::string::npos;
+}
+
+// decodes the base64 image embedded in the prompt and builds its image embedding; returns NULL (after logging) on failure
+static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
+    size_t img_base64_str_start, img_base64_str_end;
+    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
+    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
+        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        return NULL;
+    }
+    // the payload is the text between the end of the opening tag and the start of the closing tag
+    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
+    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
+    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
+    // NOTE(review): the buffer is sized with required_encode_size() and its whole size() is passed on below -- confirm the loader tolerates the unused tail
+    auto required_bytes = base64::required_encode_size(base64_str.size());
+    auto img_bytes = std::vector<unsigned char>(required_bytes);
+    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
+    if (!embed) {
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
+        return NULL;
+    }
+
+    return embed;
+}
+
+// Return `prompt` with its inline base64 image tag replaced by `replacement`;
+// a prompt without a complete tag is returned unchanged.
+static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+    size_t tag_begin, tag_end;
+    find_image_tag_in_prompt(prompt, tag_begin, tag_end);
+    if (tag_begin == std::string::npos || tag_end == std::string::npos) {
+        return prompt;
+    }
+    return prompt.substr(0, tag_begin) + replacement + prompt.substr(tag_end + strlen(IMG_BASE64_TAG_END));
+}
+
+struct llava_context { // state of one image/prompt round, filled in by llava_init_context()
+    struct clip_ctx * ctx_clip = NULL; // CLIP context loaded from params->mmproj
+    struct llama_context * ctx_llama = NULL; // context created on `model`
+    struct llama_model * model = NULL; // freed by llava_free() unless the caller resets it to NULL first
+};
+
+static void print_usage(int, char ** argv) { // logs an example command line with argv[0] as the program name
+    LOG("\n example usage:\n");
+    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+// Build the image embedding for one round. An inline base64 image in params->prompt takes
+// precedence over `fname`; once embedded, its tag is stripped from params->prompt.
+// Otherwise the image is read from `fname`. Returns NULL when the image cannot be loaded.
+static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
+    const int n_threads = params->cpuparams.n_threads;
+    auto prompt = params->prompt;
+    if (!prompt_contains_image(prompt)) {
+        llava_image_embed * from_file = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, n_threads, fname.c_str());
+        if (!from_file) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
+        }
+        return from_file;
+    }
+
+    if (!params->image.empty()) {
+        LOG_INF("using base64 encoded image instead of command line image path\n");
+    }
+    llava_image_embed * from_prompt = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, n_threads, prompt);
+    if (!from_prompt) {
+        LOG_ERR("%s: can't load image from prompt\n", __func__);
+        return NULL;
+    }
+    params->prompt = remove_image_from_prompt(prompt);
+    return from_prompt;
+}
+
+// Run one round: decode the text before the image, the image embedding and the text after it,
+// then sample up to n_predict tokens (256 when n_predict < 0), logging each piece.
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
+    int n_past = 0;
+    int cur_pos_id = 0;
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    // in a templated prompt, the image embedding is spliced in where this placeholder appears
+    const std::string image_placeholder = "<|vision_pad|>";
+    // logs the tokenization of `text` when params->verbose_prompt is set
+    auto log_tokens = [&](const std::string & text) {
+        if (!params->verbose_prompt) {
+            return;
+        }
+        auto tmp = common_tokenize(ctx_llava->ctx_llama, text, true, true);
+        for (int i = 0; i < (int) tmp.size(); i++) {
+            LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+        }
+    };
+
+    std::string system_prompt, user_prompt;
+    const size_t image_pos = prompt.find(image_placeholder);
+    if (image_pos != std::string::npos) {
+        // templating mode: the caller provides the full prompt, system message included. The tag
+        // searched for and the tag skipped must be the same, or user_prompt starts mid-tag.
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + image_placeholder.length());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
+        log_tokens(system_prompt);
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
+        log_tokens(user_prompt);
+    } else {
+        // native mode: wrap the bare prompt in the default chat template
+        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
+        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
+        log_tokens(user_prompt);
+    }
+
+    // decode in order; stop at the first failure instead of sampling from a half-built context
+    bool ok = eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
+    if (ok && image_embed != nullptr) {
+        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
+        ok = qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
+    }
+    ok = ok && eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
+    if (!ok) {
+        LOG_ERR("%s: failed to evaluate the prompt\n", __func__);
+        return;
+    }
+
+    // generate the response
+    LOG("\n");
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+    std::string response = "";
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
+        response += tmp;
+        if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
+        LOG("%s", tmp);
+        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
+        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+        fflush(stdout);
+    }
+    common_sampler_free(smpl);
+    LOG("\n");
+}
+
+// Initialize the llama backend and NUMA setup, then load the model file named in `params`.
+// Returns NULL (after logging) when the model cannot be loaded.
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    const llama_model_params model_params = common_model_params_to_llama(*params);
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (!model) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+    }
+    return model;
+}
+
+// Build the state for one round: load the CLIP model from params->mmproj and create a llama
+// context on `model` with n_ctx raised to at least 2048.
+// Returns a malloc'ed llava_context that stores `model` as given, or NULL (after logging) when
+// the llama context cannot be created; in that case the CLIP context is released again.
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
+    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        // nothing else refers to ctx_clip yet, so release it here instead of leaking it
+        if (ctx_clip) {
+            clip_free(ctx_clip);
+        }
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+// Releases the CLIP and llama contexts, ctx_llava->model (callers reset it to NULL to keep the model) and the malloc'ed struct itself.
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+    }
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+    free(ctx_llava); // allocated with malloc() in llava_init_context(); previously leaked on every call
+}
+
+#ifndef NDEBUG
+
+static void debug_test_mrope_2d() { // debug helper: applies ggml_rope_multi to a constant tensor and dumps the output to mrope_2d_<backend>.bin
+    // 1. Initialize backend
+    ggml_backend_t backend = NULL;
+    std::string backend_name = "";
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    backend_name = "cuda";
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+    // if no GPU backend is available, fall back to the CPU backend
+    if (!backend) {
+        backend = ggml_backend_cpu_init();
+        backend_name = "cpu";
+    }
+
+    // Calculate the size needed to allocate
+    size_t ctx_size = 0;
+    ctx_size += 2 * ggml_tensor_overhead(); // tensors
+    // no need to allocate anything else!
+
+    // 2. Allocate `ggml_context` to store tensor metadata (no_alloc: the data lives in the backend buffer)
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 12 * 30);
+    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
+    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    std::vector<int> pos_id;
+    pos_id.resize(30 * 4);
+    for (int i = 0; i < 30; i ++) {
+        pos_id[i] = i;
+        pos_id[i + 30] = i + 10;
+        pos_id[i + 60] = i + 20;
+        pos_id[i + 90] = i + 30;
+    }
+    int sections[4] = {32, 32, 0, 0};
+
+    // 3. Allocate a `ggml_backend_buffer` to store all tensors
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    // 4. Copy tensor data from main memory (RAM) to backend buffer
+    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
+    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
+
+    // 5. Create a `ggml_cgraph` for the rope operation
+    struct ggml_cgraph * gf = NULL;
+    struct ggml_context * ctx_cgraph = NULL;
+
+    // create a temporary context to build the graph
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+    ctx_cgraph = ggml_init(params0);
+    gf = ggml_new_graph(ctx_cgraph);
+
+    struct ggml_tensor * result0 = ggml_rope_multi(
+        ctx_cgraph, inp_raw, pos, nullptr,
+        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
+        0, 1, 32, 1);
+
+    // Add "result" tensor and all of its dependencies to the cgraph
+    ggml_build_forward_expand(gf, result0);
+
+    // 6. Create a `ggml_gallocr` for cgraph computation
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    // 7. Run the computation
+    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+    ggml_backend_graph_compute(backend, gf);
+
+    // 8. Retrieve results (output tensors)
+    // the rope output tensor created above is read back directly
+    struct ggml_tensor * result = result0;
+    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+    float * result_data = (float *)malloc(ggml_nbytes(result));
+    // because the tensor data is stored in device buffer, we need to copy it back to RAM
+    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
+    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
+    std::ofstream outFile(bin_file, std::ios::binary);
+
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
+        outFile.close();
+        std::cout << "Data successfully written to " + bin_file << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+
+    free(result_data);
+    // 9. Free memory and exit
+    ggml_free(ctx_cgraph);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+}
+
+// Debug helper: encode a synthetic 56x56 image (3 values per pixel, ramping from 0 towards 1)
+// with clip_encode_float_image() and dump the n_embd * 4 output floats to "img_embed.bin".
+static void debug_dump_img_embed(struct llava_context * ctx_llava) {
+    const char * out_path = "img_embed.bin";
+    int n_embd  = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
+    int ne = n_embd * 4;
+    std::vector<float> vals(56 * 56 * 3);
+    std::vector<float> embd(ne);
+
+    for (int i = 0; i < 56*56; i++) {
+        for (int c = 0; c < 3; c++) {
+            vals[i * 3 + c] = (float) i / (56*56); // i is already below 56*56, no modulo needed
+        }
+    }
+    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals.data(), 56, 56, embd.data());
+
+    std::ofstream outFile(out_path, std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
+        outFile.close();
+        // report the file that was actually written (the message used to name mrope.bin)
+        std::cout << "Data successfully written to " << out_path << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
+#endif
+
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+    common_params params;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+        return 1;
+    }
+    common_init();
+
+    // an mmproj file and an image (a path, or base64 data inlined in the prompt) are mandatory
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    auto * model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
+        return 1;
+    }
+
+    // One full round on a fresh context: embed the image, evaluate the prompt, generate.
+    // Returns false when the context or the image embedding could not be created.
+    auto run_image = [&](const std::string & image) -> bool {
+        auto * ctx_llava = llava_init_context(&params, model);
+        if (ctx_llava == NULL) {
+            return false;
+        }
+        auto * image_embed = load_image(ctx_llava, &params, image);
+        const bool loaded = image_embed != NULL;
+        if (loaded) {
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+            llama_perf_context_print(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+        }
+        ctx_llava->model = NULL; // the model is shared between rounds and freed once, below
+        llava_free(ctx_llava);
+        return loaded;
+    };
+
+    int status = 0;
+    if (prompt_contains_image(params.prompt)) {
+        status = run_image("") ? 0 : 1;
+#ifndef NDEBUG
+    } else if (params.image[0].empty()) {
+        // debug builds only: an empty --image value runs the self-checks instead of a prompt
+        auto * ctx_llava = llava_init_context(&params, model);
+        if (ctx_llava != NULL) {
+            debug_test_mrope_2d();
+            debug_dump_img_embed(ctx_llava);
+            llama_perf_context_print(ctx_llava->ctx_llama);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        } else {
+            status = 1;
+        }
+#endif
+    } else {
+        for (auto & image : params.image) {
+            if (!run_image(image)) {
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
+                status = 1;
+                break;
+            }
+        }
+    }
+
+    llama_free_model(model);
+    return status;
+}
diff --git a/examples/main/README.md b/examples/main/README.md
index 7787f7b11b8..17d80a622a8 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -177,16 +177,11 @@ Example usage: `--temp 0`
 
 -   `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
 -   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
--   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
 
 The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.
 
 The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
 
-Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
-
-Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
-
 ### DRY Repetition Penalty
 
 DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index 5d1e11c67b1..f9cce7b2133 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d
   - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
   - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
   - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
   - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
   - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
   - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index e78a8596d8c..a5c6fe7e585 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-        common_embd_normalize(embd, out, n_embd);
+        common_embd_normalize(embd, out, n_embd, 2);
     }
 }
 
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    LOG_INF("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %zu\n", chunks.size());
 
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 5fe70dac7f1..8b1b23edad1 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -12,6 +12,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
     #ifdef _WIN32
         MEMORYSTATUSEX status;
diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt
index 52add51ef77..0686d630570 100644
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-run)
 add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/run/README.md b/examples/run/README.md
index 6e926811f3c..a0680544120 100644
--- a/examples/run/README.md
+++ b/examples/run/README.md
@@ -3,5 +3,49 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
 
 ```bash
-./llama-run Meta-Llama-3.1-8B-Instruct.gguf
-...
+llama-run granite-code
+```
+
+```bash
+llama-run -h
+Description:
+  Runs a llm
+
+Usage:
+  llama-run [options] model [prompt]
+
+Options:
+  -c, --context-size <value>
+      Context size (default: 2048)
+  -n, --ngl <value>
+      Number of GPU layers (default: 0)
+  --temp <value>
+      Temperature (default: 0.8)
+  -v, --verbose, --log-verbose
+      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
+  -h, --help
+      Show help message
+
+Commands:
+  model
+      Model is a string with an optional prefix of
+      huggingface:// (hf://), ollama://, https:// or file://.
+      If no protocol is specified and a file exists in the specified
+      path, file:// is assumed, otherwise if a file does not exist in
+      the specified path, ollama:// is assumed. Models that are being
+      pulled are downloaded with .partial extension while being
+      downloaded and then renamed as the file without the .partial
+      extension when complete.
+
+Examples:
+  llama-run llama3
+  llama-run ollama://granite-code
+  llama-run ollama://smollm:135m
+  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
+  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
+  llama-run https://example.com/some-file1.gguf
+  llama-run some-file2.gguf
+  llama-run file://some-file3.gguf
+  llama-run --ngl 999 some-file4.gguf
+  llama-run --ngl 999 some-file5.gguf Hello World
+```
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index cac2faefcc2..f89d041c44a 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -1,192 +1,714 @@
 #if defined(_WIN32)
-#include <windows.h>
+#    include <windows.h>
 #else
-#include <unistd.h>
+#    include <sys/file.h>
+#    include <sys/ioctl.h>
+#    include <unistd.h>
+#endif
+
+#if defined(LLAMA_USE_CURL)
+#    include <curl/curl.h>
 #endif
 
 #include <climits>
+#include <cstdarg>
 #include <cstdio>
 #include <cstring>
+#include <filesystem>
 #include <iostream>
 #include <sstream>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
+#include "common.h"
+#include "json.hpp"
 #include "llama-cpp.h"
 
-typedef std::unique_ptr<char[]> char_array_ptr;
+// printf-style formatting into a std::string: a first pass measures, a second pass writes.
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static std::string fmt(const char * fmt, ...) {
+    va_list args_measure;
+    va_list args_write;
+    va_start(args_measure, fmt);
+    va_copy(args_write, args_measure);
+    // With a NULL buffer vsnprintf only reports the length the output would need.
+    const int size = vsnprintf(NULL, 0, fmt, args_measure);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::string text(size, '\0');
+    const int size2 = vsnprintf(const_cast<char *>(text.data()), text.size() + 1, fmt, args_write);
+    GGML_ASSERT(size2 == size);
+    va_end(args_write);
+    va_end(args_measure);
+    return text;
+}
 
-struct Argument {
-    std::string flag;
-    std::string help_text;
-};
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static int printe(const char * fmt, ...) {  // printf-style output to stderr
+    va_list ap;
+    va_start(ap, fmt);
+    const int rc = vfprintf(stderr, fmt, ap);  // vfprintf's own result is handed back to the caller
+    va_end(ap);
 
-struct Options {
-    std::string model_path, prompt_non_interactive;
-    int ngl = 99;
-    int n_ctx = 2048;
-};
+    return rc;
+}
 
-class ArgumentParser {
-   public:
-    ArgumentParser(const char * program_name) : program_name(program_name) {}
+// Command-line options of llama-run plus the llama context/model parameters derived from them.
+class Opt {
+  public:
+    // Parses argv into this object. Returns 0 on success, 1 on a usage error, 2 after printing requested help.
+    int init(int argc, const char ** argv) {
+        ctx_params           = llama_context_default_params();
+        model_params         = llama_model_default_params();
+        context_size_default = ctx_params.n_batch;
+        ngl_default          = model_params.n_gpu_layers;
+        common_params_sampling sampling;
+        temperature_default = sampling.temp;
+        if (argc < 2) {
+            printe("Error: No arguments provided.\n");
+            print_help();
+            return 1;
+        }
 
-    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
-        string_args[flag] = &var;
-        arguments.push_back({flag, help_text});
+        // Parse arguments
+        if (parse(argc, argv)) {
+            printe("Error: Failed to parse arguments.\n");
+            print_help();
+            return 1;
+        }
+        // If help is requested, show help and exit
+        if (help) {
+            print_help();
+            return 2;
+        }
+
+        ctx_params.n_batch        = context_size >= 0 ? context_size : context_size_default;
+        ctx_params.n_ctx          = ctx_params.n_batch;  // -c is documented as the context size: apply it to n_ctx too
+        model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
+        temperature               = temperature >= 0 ? temperature : temperature_default;
+        return 0;  // Success
     }
 
-    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
-        int_args[flag] = &var;
-        arguments.push_back({flag, help_text});
+    llama_context_params ctx_params;
+    llama_model_params   model_params;
+    std::string model_;  // first positional argument
+    std::string          user;  // remaining positional arguments, joined with single spaces
+    int                  context_size = -1, ngl = -1;  // negative: not given, init() falls back to the default
+    float                temperature = -1;             // negative: not given, init() falls back to the default
+    bool                 verbose     = false;
+
+  private:
+    int   context_size_default = -1, ngl_default = -1;
+    float temperature_default = -1;
+    bool  help                = false;
+    // True when argv[i] equals either spelling of a flag.
+    bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
+        return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
+    }
+    // Stores the argument after argv[i] in option_value (via atoi) and advances i; returns 1 when it is missing.
+    int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atoi(argv[++i]);
+
+        return 0;
+    }
+    // Float overload of the helper above; the value is converted with atof.
+    int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atof(argv[++i]);
+
+        return 0;
     }
 
+    // Walks argv: options are recognized until "--"; the first positional is the model, the rest form the prompt.
     int parse(int argc, const char ** argv) {
-        for (int i = 1; i < argc; ++i) {
-            std::string arg = argv[i];
-            if (string_args.count(arg)) {
-                if (i + 1 < argc) {
-                    *string_args[arg] = argv[++i];
-                } else {
-                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
-                    print_usage();
+        bool options_parsing   = true;
+        for (int i = 1, positional_args_i = 0; i < argc; ++i) {
+            if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+                if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+                if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+                if (handle_option_with_value(argc, argv, i, temperature) == 1) {
                     return 1;
                 }
-            } else if (int_args.count(arg)) {
-                if (i + 1 < argc) {
-                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
-                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
-                        print_usage();
-                        return 1;
-                    }
-                } else {
-                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
-                    print_usage();
+            } else if (options_parsing &&
+                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+                verbose = true;
+            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+                help = true;
+                return 0;
+            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+                options_parsing = false;
+            } else if (positional_args_i == 0) {
+                if (!argv[i][0] || argv[i][0] == '-') {
                     return 1;
                 }
+
+                ++positional_args_i;
+                model_ = argv[i];
+            } else if (positional_args_i == 1) {
+                ++positional_args_i;
+                user = argv[i];
             } else {
-                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
-                print_usage();
+                user += " " + std::string(argv[i]);
+            }
+        }
+        return 0;
+    }
+    // Prints the usage text to stdout, filling in the default values captured by init().
+    void print_help() const {
+        printf(
+            "Description:\n"
+            "  Runs a llm\n"
+            "\n"
+            "Usage:\n"
+            "  llama-run [options] model [prompt]\n"
+            "\n"
+            "Options:\n"
+            "  -c, --context-size <value>\n"
+            "      Context size (default: %d)\n"
+            "  -n, --ngl <value>\n"
+            "      Number of GPU layers (default: %d)\n"
+            "  --temp <value>\n"
+            "      Temperature (default: %.1f)\n"
+            "  -v, --verbose, --log-verbose\n"
+            "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
+            "  -h, --help\n"
+            "      Show help message\n"
+            "\n"
+            "Commands:\n"
+            "  model\n"
+            "      Model is a string with an optional prefix of \n"
+            "      huggingface:// (hf://), ollama://, https:// or file://.\n"
+            "      If no protocol is specified and a file exists in the specified\n"
+            "      path, file:// is assumed, otherwise if a file does not exist in\n"
+            "      the specified path, ollama:// is assumed. Models that are being\n"
+            "      pulled are downloaded with .partial extension while being\n"
+            "      downloaded and then renamed as the file without the .partial\n"
+            "      extension when complete.\n"
+            "\n"
+            "Examples:\n"
+            "  llama-run llama3\n"
+            "  llama-run ollama://granite-code\n"
+            "  llama-run ollama://smollm:135m\n"
+            "  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
+            "  llama-run "
+            "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
+            "  llama-run https://example.com/some-file1.gguf\n"
+            "  llama-run some-file2.gguf\n"
+            "  llama-run file://some-file3.gguf\n"
+            "  llama-run --ngl 999 some-file4.gguf\n"
+            "  llama-run --ngl 999 some-file5.gguf Hello World\n",
+            context_size_default, ngl_default, temperature_default);
+    }
+};
+
+struct progress_data {  // state handed to the curl progress callback through its user pointer
+    size_t                                file_size{ 0 };  // bytes already present in the partial file
+    std::chrono::steady_clock::time_point start_time{ std::chrono::steady_clock::now() };
+    bool                                  printed{ false };  // set once a progress line has been drawn
+};
+
+// Width of the terminal attached to stdout, in columns; 80 when stdout is not a terminal or the query fails.
+static int get_terminal_width() {
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO csbi;
+    const bool ok = GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi) != 0;
+    return ok ? csbi.srWindow.Right - csbi.srWindow.Left + 1 : 80;
+#else
+    struct winsize w = {};  // zeroed so a failed ioctl can never expose indeterminate values
+    return (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) == 0 && w.ws_col > 0) ? w.ws_col : 80;
+#endif
+}
+
+#ifdef LLAMA_USE_CURL
+class File {  // FILE* holder that can take an exclusive lock; the destructor unlocks and closes it
+  public:
+    FILE * file = nullptr;  // stays null until open() succeeds
+
+    FILE * open(const std::string & filename, const char * mode) {  // fopen wrapper; returns the stored stream
+        file = fopen(filename.c_str(), mode);
+
+        return file;
+    }
+
+    int lock() {  // non-blocking exclusive lock: 0 on success (also when no file is open), 1 on failure
+        if (file) {
+#    ifdef _WIN32
+            fd    = _fileno(file);
+            hFile = (HANDLE) _get_osfhandle(fd);
+            if (hFile == INVALID_HANDLE_VALUE) {
+                fd = -1;  // reset so the destructor does not try to unlock
+
+                return 1;
+            }
+
+            OVERLAPPED overlapped = { 0 };
+            if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
+                            &overlapped)) {
+                fd = -1;  // reset so the destructor does not try to unlock
+
                 return 1;
             }
+#    else
+            fd = fileno(file);
+            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                fd = -1;  // reset so the destructor does not try to unlock
+
+                return 1;
+            }
+#    endif
         }
 
-        if (string_args["-m"]->empty()) {
-            fprintf(stderr, "error: -m is required\n");
-            print_usage();
+        return 0;
+    }
+
+    ~File() {  // releases the lock (if one was taken) before closing the stream
+        if (fd >= 0) {
+#    ifdef _WIN32
+            if (hFile != INVALID_HANDLE_VALUE) {
+                OVERLAPPED overlapped = { 0 };
+                UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
+            }
+#    else
+            flock(fd, LOCK_UN);
+#    endif
+        }
+
+        if (file) {
+            fclose(file);
+        }
+    }
+
+  private:
+    int fd = -1;  // descriptor of `file`; non-negative only after lock() succeeded
+#    ifdef _WIN32
+    HANDLE hFile;  // assigned in lock(); the destructor reads it only when fd >= 0
+#    endif
+};
+
+class HttpClient {
+  public:
+    // Transfers `url` with libcurl. When `response_str` is given the body is appended to it; otherwise it is
+    // written to "<output_file>.partial", renamed to `output_file` on success. Returns 0 on success, 1 on failure.
+    int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
+             const bool progress, std::string * response_str = nullptr) {
+        std::string output_file_partial;
+        curl = curl_easy_init();
+        if (!curl) {
             return 1;
         }
 
+        progress_data data;
+        File          out;
+        if (!output_file.empty()) {
+            // Append mode keeps bytes from an earlier, interrupted run so the transfer can resume after them.
+            output_file_partial = output_file + ".partial";
+            if (!out.open(output_file_partial, "ab")) {
+                printe("Failed to open file\n");
+                return 1;
+            }
+
+            if (out.lock()) {
+                printe("Failed to exclusively lock file\n");
+                return 1;
+            }
+        }
+
+        set_write_options(response_str, out);
+        data.file_size = set_resume_point(output_file_partial);
+        set_progress_options(progress, data);
+        set_headers(headers);
+        if (perform(url)) {
+            return 1;  // transfer failed: leave the .partial file in place instead of publishing it
+        }
+        if (!output_file.empty()) {
+            std::filesystem::rename(output_file_partial, output_file);
+        }
         return 0;
     }
 
-   private:
-    const char * program_name;
-    std::unordered_map<std::string, std::string *> string_args;
-    std::unordered_map<std::string, int *> int_args;
-    std::vector<Argument> arguments;
+    ~HttpClient() {
+        if (chunk) {
+            curl_slist_free_all(chunk);
+        }
+        if (curl) {
+            curl_easy_cleanup(curl);
+        }
+    }
+
+  private:
+    CURL *              curl  = nullptr;
+    struct curl_slist * chunk = nullptr;  // header list given to CURLOPT_HTTPHEADER; freed in the destructor
+    // Sends the response body to *response_str when given, otherwise to the FILE held by `out`.
+    void set_write_options(std::string * response_str, const File & out) {
+        if (response_str) {
+            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
+        } else {
+            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file);
+        }
+    }
+    // If output_file exists, asks curl to resume after its current size; returns that size (0 when absent).
+    size_t set_resume_point(const std::string & output_file) {
+        size_t file_size = 0;
+        if (std::filesystem::exists(output_file)) {
+            file_size = std::filesystem::file_size(output_file);
+            curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
+        }
+
+        return file_size;
+    }
+    // Installs update_progress with `data` as its user pointer when `progress` is true.
+    void set_progress_options(bool progress, progress_data & data) {
+        if (progress) {
+            curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+            curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
+            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress);
+        }
+    }
+    // Replaces the request header list with `headers`; an empty vector leaves the handle untouched.
+    void set_headers(const std::vector<std::string> & headers) {
+        if (!headers.empty()) {
+            if (chunk) {
+                curl_slist_free_all(chunk);
+                chunk = 0;
+            }
+
+            for (const auto & header : headers) {
+                chunk = curl_slist_append(chunk, header.c_str());
+            }
+
+            curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
+        }
+    }
+    // Runs the transfer. Returns 0 on success; on failure prints libcurl's error text and returns 1.
+    int perform(const std::string & url) {
+        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+        curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
+        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+        const CURLcode res = curl_easy_perform(curl);
+        if (res != CURLE_OK) {
+            printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
+            return 1;
+        }
+        return 0;
+    }
+    // Formats whole seconds as "1h 02m 03s", "2m 03s" or "3s".
+    static std::string human_readable_time(double seconds) {
+        int hrs  = static_cast<int>(seconds) / 3600;
+        int mins = (static_cast<int>(seconds) % 3600) / 60;
+        int secs = static_cast<int>(seconds) % 60;
+        if (hrs > 0) {
+            return fmt("%dh %02dm %02ds", hrs, mins, secs);
+        } else if (mins > 0) {
+            return fmt("%dm %02ds", mins, secs);
+        } else {
+            return fmt("%ds", secs);
+        }
+    }
+    // Formats a byte count with two decimals and a 1024-based B/KB/MB/GB/TB suffix.
+    static std::string human_readable_size(curl_off_t size) {
+        static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
+        const int           length   = sizeof(suffix) / sizeof(suffix[0]);
+        int                 i        = 0;
+        double              dbl_size = size;
+        if (size > 1024) {
+            for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
+                dbl_size = size / 1024.0;
+            }
+        }
+        return fmt("%.2f %s", dbl_size, suffix[i]);
+    }
 
-    int parse_int_arg(const char * arg, int & value) {
-        char * end;
-        const long val = std::strtol(arg, &end, 10);
-        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
-            value = static_cast<int>(val);
+    // libcurl transfer-info callback: redraws the progress line. Always returns 0 so the transfer continues.
+    static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
+                               curl_off_t) {
+        progress_data * data = static_cast<progress_data *>(ptr);
+        if (total_to_download <= 0) {
             return 0;
         }
-        return 1;
+        // Add the bytes already on disk (resumed download) so both figures describe the whole file.
+        total_to_download += data->file_size;
+        const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
+        const curl_off_t percentage      = calculate_percentage(now_downloaded_plus_file_size, total_to_download);
+        std::string      progress_prefix = generate_progress_prefix(percentage);
+
+        // The ETA covers only the bytes still missing; it is 0 while no speed estimate exists (no division by 0).
+        const double speed = calculate_speed(now_downloaded, data->start_time);
+        const double tim   = speed > 0 ? (total_to_download - now_downloaded_plus_file_size) / speed : 0;
+        std::string  progress_suffix =
+            generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim);
+
+        int         progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix);
+        std::string progress_bar;
+        generate_progress_bar(progress_bar_width, percentage, progress_bar);
+        print_progress(progress_prefix, progress_bar, progress_suffix);
+        data->printed = true;
+        return 0;
     }
 
-    void print_usage() const {
-        printf("\nUsage:\n");
-        printf("  %s [OPTIONS]\n\n", program_name);
-        printf("Options:\n");
-        for (const auto & arg : arguments) {
-            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
+    static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) {
+        return (now_downloaded_plus_file_size * 100) / total_to_download;
+    }
+    // curl_off_t is not `long` on every platform, so cast to match the %ld conversion.
+    static std::string generate_progress_prefix(curl_off_t percentage) {
+        return fmt("%3ld%% |", static_cast<long int>(percentage));
+    }
+    // Average bytes per second of this transfer since start_time.
+    static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
+        const auto                          now             = std::chrono::steady_clock::now();
+        const std::chrono::duration<double> elapsed_seconds = now - start_time;
+        return now_downloaded / elapsed_seconds.count();
+    }
+    // Builds "<done>/<total><speed>/s<eta>", each field right-aligned in a 10-column slot.
+    static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
+                                                double speed, double estimated_time) {
+        const int width = 10;
+        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
+                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
+                   human_readable_time(estimated_time).c_str());
+    }
+    // Columns left for the bar after the prefix, the suffix and 3 more characters; never less than 1.
+    static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
+        int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3;
+        if (progress_bar_width < 1) {
+            progress_bar_width = 1;
         }
 
-        printf("\n");
+        return progress_bar_width;
+    }
+    // Appends one cell per column to progress_bar: "█" for the completed share, " " for the rest.
+    static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage,
+                                             std::string & progress_bar) {
+        const curl_off_t pos = (percentage * progress_bar_width) / 100;
+        for (int i = 0; i < progress_bar_width; ++i) {
+            progress_bar.append((i < pos) ? "█" : " ");
+        }
+        return progress_bar;
+    }
+    // Blanks the current terminal line, then redraws prefix, bar and suffix on it (stderr, via printe).
+    static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
+                               const std::string & progress_suffix) {
+        printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
+               progress_suffix.c_str());
+    }
+    // Function to write data to a file
+    static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+        FILE * out = static_cast<FILE *>(stream);
+        return fwrite(ptr, size, nmemb, out);
+    }
+    // Function to capture data into a string
+    static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+        std::string * str = static_cast<std::string *>(stream);
+        str->append(static_cast<char *>(ptr), size * nmemb);
+        return size * nmemb;
     }
 };
+#endif
 
 class LlamaData {
-   public:
-    llama_model_ptr model;
-    llama_sampler_ptr sampler;
-    llama_context_ptr context;
+  public:
+    llama_model_ptr                 model;
+    llama_sampler_ptr               sampler;
+    llama_context_ptr               context;
     std::vector<llama_chat_message> messages;
+    std::vector<std::string>        msg_strs;
+    std::vector<char>               fmtted;
 
-    int init(const Options & opt) {
-        model = initialize_model(opt.model_path, opt.ngl);
+    int init(Opt & opt) {  // sets up model, context and sampler in that order; 0 on success, 1 on failure
+        model = initialize_model(opt);
         if (!model) {
             return 1;
         }
 
-        context = initialize_context(model, opt.n_ctx);
+        context = initialize_context(model, opt);
         if (!context) {
             return 1;
         }
 
-        sampler = initialize_sampler();
+        sampler = initialize_sampler(opt);  // unlike model and context, the result is not checked here
         return 0;
     }
 
-   private:
-    // Initializes the model and returns a unique pointer to it
-    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
-        llama_model_params model_params = llama_model_default_params();
-        model_params.n_gpu_layers = ngl;
+  private:
+#ifdef LLAMA_USE_CURL
+    // Thin wrapper over HttpClient::init; any non-zero status from it is reported as 1.
+    int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
+                 const bool progress, std::string * response_str = nullptr) {
+        HttpClient client;
+        const int  status = client.init(url, headers, output_file, progress, response_str);
+
+        return status ? 1 : 0;
+    }
+#else
+    // Stand-in for builds without libcurl: reports the missing feature and always fails.
+    int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
+                 std::string * = nullptr) {
+        printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+        return 1;
+    }
+#endif
+
+    // Fetches "<org>/<repo>/<file>" from huggingface.co into `bn`; returns 1 when `model` lacks a second '/'.
+    int huggingface_dl(const std::string & model, const std::vector<std::string> & headers, const std::string & bn) {
+        size_t pos = model.find('/');
+        pos        = model.find('/', pos + 1);  // second '/': separates "<org>/<repo>" from the file path
+        if (pos == std::string::npos) {
+            return 1;
+        }
 
-        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
+        const std::string hfr = model.substr(0, pos);   // repository, e.g. "<org>/<repo>"
+        const std::string hff = model.substr(pos + 1);  // file path inside the repository
+        const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+        return download(url, headers, bn, true);
+    }
+
+    // Looks `model` ("[namespace/]name[:tag]") up in the ollama registry and downloads its model layer into `bn`.
+    // `model` is rewritten in place to "namespace/name". Returns 0 on success, non-zero on any failure.
+    int ollama_dl(std::string & model, const std::vector<std::string> & headers, const std::string & bn) {
+        if (model.find('/') == std::string::npos) {
+            model = "library/" + model;
+        }
+        std::string  model_tag = "latest";
+        const size_t colon_pos = model.find(':');
+        if (colon_pos != std::string::npos) {
+            model_tag = model.substr(colon_pos + 1);
+            model     = model.substr(0, colon_pos);
+        }
+        std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
+        std::string manifest_str;
+        if (const int ret = download(manifest_url, headers, "", false, &manifest_str)) {
+            return ret;
+        }
+        try {  // parse()/at() throw on a malformed manifest; report that instead of terminating the program
+            const nlohmann::json manifest = nlohmann::json::parse(manifest_str);
+            for (const auto & l : manifest.at("layers")) {
+                if (l.at("mediaType") == "application/vnd.ollama.image.model") {
+                    const std::string layer = l.at("digest").get<std::string>();
+                    return download("https://registry.ollama.ai/v2/" + model + "/blobs/" + layer, headers, bn, true);
+                }
+            }
+            printe("%s: manifest has no model layer\n", __func__);
+        } catch (const nlohmann::json::exception & e) {
+            printe("%s: invalid manifest: %s\n", __func__, e.what());
+        }
+        return 1;
+    }
+
+    // Returns the last component of `path`: the text after its final '/' or '\\' separator.
+    // A path with no separator is returned unchanged.
+    std::string basename(const std::string & path) {
+        const std::string::size_type sep     = path.find_last_of("/\\");
+        const bool                   has_sep = sep != std::string::npos;
+
+        return has_sep ? path.substr(sep + 1) : path;
+    }
+
+    // Strips everything up to and including the first "://" from `model_` in place; returns 1 if there is none, else 0.
+    int remove_proto(std::string & model_) {
+        const size_t sep = model_.find("://");
+        if (sep != std::string::npos) {
+            model_.erase(0, sep + 3);  // 3 == length of "://"
+            return 0;
+        }
+        return 1;
+    }
+
+    // Turns `model_` into a local file path, downloading the model first when it is not already on disk.
+    // Returns 0 on success, or the non-zero status reported by the download helper that failed.
+    int resolve_model(std::string & model_) {
+        if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
+            remove_proto(model_);  // no-op when the path carries no "://"
+            return 0;
+        }
+        int                            ret     = 0;
+        const std::string              bn      = basename(model_);
+        const std::vector<std::string> headers = { "--header",
+                                                   "Accept: application/vnd.docker.distribution.manifest.v2+json" };
+        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
+            remove_proto(model_);
+            ret = huggingface_dl(model_, headers, bn);
+        } else if (string_starts_with(model_, "ollama://")) {
+            remove_proto(model_);
+            ret = ollama_dl(model_, headers, bn);
+        } else if (string_starts_with(model_, "https://")) {
+            ret = download(model_, headers, bn, true);  // propagate the status like the other branches do
+        } else {  // no recognised scheme: treat the argument as an Ollama model name
+            ret = ollama_dl(model_, headers, bn);
+        }
+
+        model_ = bn;
+
+        return ret;
+    }
+
+    // Initializes the model and returns a unique pointer to it
+    llama_model_ptr initialize_model(Opt & opt) {
+        ggml_backend_load_all();
+        resolve_model(opt.model_);
+        printe(
+            "\r%*s"
+            "\rLoading model",
+            get_terminal_width(), " ");
+        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
         if (!model) {
-            fprintf(stderr, "%s: error: unable to load model\n", __func__);
+            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }
 
+        printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
         return model;
     }
 
     // Initializes the context with the specified parameters
-    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
-        llama_context_params ctx_params = llama_context_default_params();
-        ctx_params.n_ctx = n_ctx;
-        ctx_params.n_batch = n_ctx;
-
-        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+    llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
+        llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
         if (!context) {
-            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+            printe("%s: error: failed to create the llama_context\n", __func__);
         }
 
         return context;
     }
 
     // Initializes and configures the sampler
-    llama_sampler_ptr initialize_sampler() {
+    llama_sampler_ptr initialize_sampler(const Opt & opt) {
         llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
         llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
-        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
         llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
         return sampler;
     }
 };
 
-// Add a message to `messages` and store its content in `owned_content`
-static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
-                        std::vector<char_array_ptr> & owned_content) {
-    char_array_ptr content(new char[text.size() + 1]);
-    std::strcpy(content.get(), text.c_str());
-    llama_data.messages.push_back({role, content.get()});
-    owned_content.push_back(std::move(content));
+// Add a message to `messages` and store its content in `msg_strs`
+static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
+    llama_data.msg_strs.push_back(std::move(text));
+    llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
 }
 
 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
-    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
-                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
-    if (result > static_cast<int>(formatted.size())) {
-        formatted.resize(result);
+static int apply_chat_template(LlamaData & llama_data, const bool append) {
+    int result = llama_chat_apply_template(
+        llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
+        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
+    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
+        llama_data.fmtted.resize(result);
         result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
-                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
+                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
+                                           llama_data.fmtted.size());
     }
 
     return result;
@@ -199,7 +721,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
     prompt_tokens.resize(n_prompt_tokens);
     if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
                        true) < 0) {
-        GGML_ABORT("failed to tokenize the prompt\n");
+        printe("failed to tokenize the prompt\n");
+        return -1;
     }
 
     return n_prompt_tokens;
@@ -207,11 +730,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
 
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
-    const int n_ctx = llama_n_ctx(ctx.get());
+    const int n_ctx      = llama_n_ctx(ctx.get());
     const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf("\033[0m\n");
-        fprintf(stderr, "context size exceeded\n");
+        printe("context size exceeded\n");
         return 1;
     }
 
@@ -221,9 +744,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
 // convert the token to a string
 static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
     char buf[256];
-    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
+    int  n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
     if (n < 0) {
-        GGML_ABORT("failed to convert token to piece\n");
+        printe("failed to convert token to piece\n");
+        return 1;
     }
 
     piece = std::string(buf, n);
@@ -238,19 +762,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
 
 // helper function to evaluate a prompt and generate a response
 static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
-    std::vector<llama_token> prompt_tokens;
-    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
-    if (n_prompt_tokens < 0) {
+    std::vector<llama_token> tokens;
+    if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
         return 1;
     }
 
     // prepare a batch for the prompt
-    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
     llama_token new_token_id;
     while (true) {
         check_context_size(llama_data.context, batch);
         if (llama_decode(llama_data.context.get(), batch)) {
-            GGML_ABORT("failed to decode\n");
+            printe("failed to decode\n");
+            return 1;
         }
 
         // sample the next token, check is it an end of generation?
@@ -273,44 +797,34 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
     return 0;
 }
 
-static int parse_arguments(const int argc, const char ** argv, Options & opt) {
-    ArgumentParser parser(argv[0]);
-    parser.add_argument("-m", opt.model_path, "model");
-    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
-    parser.add_argument("-c", opt.n_ctx, "context_size");
-    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
-    if (parser.parse(argc, argv)) {
-        return 1;
-    }
-
-    return 0;
-}
-
 static int read_user_input(std::string & user) {
     std::getline(std::cin, user);
-    return user.empty();  // Indicate an error or empty input
+    return user.empty();  // Should have data in happy path
 }
 
 // Function to generate a response based on the prompt
-static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response,
+                             const bool stdout_a_terminal) {
     // Set response color
-    printf("\033[33m");
+    if (stdout_a_terminal) {
+        printf("\033[33m");
+    }
+
     if (generate(llama_data, prompt, response)) {
-        fprintf(stderr, "failed to generate response\n");
+        printe("failed to generate response\n");
         return 1;
     }
 
     // End response with color reset and newline
-    printf("\n\033[0m");
+    printf("\n%s", stdout_a_terminal ? "\033[0m" : "");
     return 0;
 }
 
 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
-                                                   const bool is_user_input, int & output_length) {
-    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
+static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
+    const int new_len = apply_chat_template(llama_data, append);
     if (new_len < 0) {
-        fprintf(stderr, "failed to apply the chat template\n");
+        printe("failed to apply the chat template\n");
         return -1;
     }
 
@@ -319,62 +833,82 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data,
 }
 
 // Helper function to handle user input
-static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
-    if (!prompt_non_interactive.empty()) {
-        user_input = prompt_non_interactive;
-        return true;  // No need for interactive input
+static int handle_user_input(std::string & user_input, const std::string & user) {
+    if (!user.empty()) {
+        user_input = user;
+        return 0;  // No need for interactive input
     }
 
-    printf("\033[32m> \033[0m");
-    return !read_user_input(user_input);  // Returns false if input ends the loop
+    printf(
+        "\r%*s"
+        "\r\033[32m> \033[0m",
+        get_terminal_width(), " ");
+    return read_user_input(user_input);  // Returns true if input ends the loop
+}
+
+// Reports whether standard input is attached to a console (Windows) or a tty (other platforms).
+static bool is_stdin_a_terminal() {
+#if defined(_WIN32)
+    DWORD console_mode;
+    return GetConsoleMode(GetStdHandle(STD_INPUT_HANDLE), &console_mode) != 0;
+#else
+    return isatty(STDIN_FILENO) != 0;
+#endif
+}
+
+// Reports whether standard output is attached to a console (Windows) or a tty (other platforms).
+static bool is_stdout_a_terminal() {
+#if defined(_WIN32)
+    DWORD console_mode;
+    return GetConsoleMode(GetStdHandle(STD_OUTPUT_HANDLE), &console_mode) != 0;
+#else
+    return isatty(STDOUT_FILENO) != 0;
+#endif
 }
 
 // Function to tokenize the prompt
-static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
-    std::vector<char_array_ptr> owned_content;
-    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
+static int chat_loop(LlamaData & llama_data, const std::string & user) {
     int prev_len = 0;
-
+    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
-        if (!handle_user_input(user_input, prompt_non_interactive)) {
-            break;
+        while (handle_user_input(user_input, user)) {
         }
 
-        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
-                    owned_content);
-
+        add_message("user", user.empty() ? user_input : user, llama_data);
         int new_len;
-        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
+        if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
             return 1;
         }
 
-        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
+        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
         std::string response;
-        if (generate_response(llama_data, prompt, response)) {
+        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+            return 1;
+        }
+
+        if (!user.empty()) {
+            break;
+        }
+
+        add_message("assistant", response, llama_data);
+        if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
             return 1;
         }
     }
+
     return 0;
 }
 
-static void log_callback(const enum ggml_log_level level, const char * text, void *) {
-    if (level == GGML_LOG_LEVEL_ERROR) {
-        fprintf(stderr, "%s", text);
+static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
+    const Opt * opt = static_cast<Opt *>(p);
+    if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
+        printe("%s", text);
     }
 }
 
-static bool is_stdin_a_terminal() {
-#if defined(_WIN32)
-    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
-    DWORD mode;
-    return GetConsoleMode(hStdin, &mode);
-#else
-    return isatty(STDIN_FILENO);
-#endif
-}
-
 static std::string read_pipe_data() {
     std::ostringstream result;
     result << std::cin.rdbuf();  // Read all data from std::cin
@@ -382,26 +916,29 @@ static std::string read_pipe_data() {
 }
 
 int main(int argc, const char ** argv) {
-    Options opt;
-    if (parse_arguments(argc, argv, opt)) {
+    Opt       opt;
+    const int ret = opt.init(argc, argv);
+    if (ret == 2) {
+        return 0;
+    } else if (ret) {
         return 1;
     }
 
     if (!is_stdin_a_terminal()) {
-        if (!opt.prompt_non_interactive.empty()) {
-            opt.prompt_non_interactive += "\n\n";
+        if (!opt.user.empty()) {
+            opt.user += "\n\n";
         }
 
-        opt.prompt_non_interactive += read_pipe_data();
+        opt.user += read_pipe_data();
     }
 
-    llama_log_set(log_callback, nullptr);
+    llama_log_set(log_callback, &opt);
     LlamaData llama_data;
     if (llama_data.init(opt)) {
         return 1;
     }
 
-    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
+    if (chat_loop(llama_data, opt.user)) {
         return 1;
     }
 
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 63fca1d590f..1b7cc8c1328 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -15,7 +15,7 @@ set(TARGET_SRCS
     httplib.h
 )
 set(PUBLIC_ASSETS
-    index.html
+    index.html.gz
     loading.html
 )
 
@@ -34,6 +34,7 @@ endforeach()
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
+target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
 if (LLAMA_SERVER_SSL)
diff --git a/examples/server/README.md b/examples/server/README.md
index 6294f541fc7..c7d91be9976 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
 | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -104,7 +104,6 @@ The project is under active development, and we are [looking for feedback and co
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--penalize-nl` | penalize newline tokens (default: false) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
@@ -138,6 +137,7 @@ The project is under active development, and we are [looking for feedback and co
 | -------- | ----------- |
 | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
 | `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
 | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@@ -146,7 +146,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -164,13 +164,13 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
 
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@@ -303,23 +303,23 @@ mkdir llama-client
 cd llama-client
 ```
 
-Create a index.js file and put this inside:
+Create an index.js file and put this inside:
 
 ```javascript
-const prompt = `Building a website can be done in 10 simple steps:`;
+const prompt = "Building a website can be done in 10 simple steps:"
 
-async function Test() {
+async function test() {
     let response = await fetch("http://127.0.0.1:8080/completion", {
-        method: 'POST',
+        method: "POST",
         body: JSON.stringify({
             prompt,
-            n_predict: 512,
+            n_predict: 64,
         })
     })
     console.log((await response.json()).content)
 }
 
-Test()
+test()
 ```
 
 And run it:
@@ -343,6 +343,10 @@ node index.js
 
 ### POST `/completion`: Given a `prompt`, it returns the predicted completion.
 
+> [!IMPORTANT]
+>
+> This endpoint is **not** OAI-compatible
+
 *Options:*
 
 `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
@@ -381,7 +385,7 @@ Multiple prompts are also supported. In this case, the completion result will be
 `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
-`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
 
 `stop`: Specify a JSON array of stopping strings.
 These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
@@ -392,8 +396,6 @@ These words will not be included in the completion, so make sure to add them to
 
 `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
 
-`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
-
 `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
 
 `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
@@ -440,40 +442,76 @@ These words will not be included in the completion, so make sure to add them to
 
 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
+`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
-    `timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
+`timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
 
-**Response format**
+`post_sampling_probs`: Returns the probabilities of the top `n_probs` tokens after applying the sampling chain.
 
-- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
+`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
 
-- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+**Response format**
 
-```json
-{
-  "content": "<the token selected by the model>",
-  "probs": [
-    {
-      "prob": float,
-      "tok_str": "<most likely token>"
-    },
-    {
-      "prob": float,
-      "tok_str": "<second most likely token>"
-    },
-    ...
-  ]
-},
-```
+- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
 
-Notice that each `probs` is an array of length `n_probs`.
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
+  ```json
+  {
+    "content": "<the generated completion text>",
+    "tokens": [ generated token ids if requested ],
+    ...
+    "probs": [
+      {
+        "id": <token id>,
+        "logprob": float,
+        "token": "<most likely token>",
+        "bytes": [int, int, ...],
+        "top_logprobs": [
+          {
+            "id": <token id>,
+            "logprob": float,
+            "token": "<token text>",
+            "bytes": [int, int, ...],
+          },
+          {
+            "id": <token id>,
+            "logprob": float,
+            "token": "<token text>",
+            "bytes": [int, int, ...],
+          },
+          ...
+        ]
+      },
+      {
+        "id": <token id>,
+        "logprob": float,
+        "token": "<most likely token>",
+        "bytes": [int, int, ...],
+        "top_logprobs": [
+          ...
+        ]
+      },
+      ...
+    ]
+  },
+  ```
+  Please note that if `post_sampling_probs` is set to `true`:
+    - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0
+    - `top_logprobs` will be replaced with `top_probs`. Each element contains:
+      - `id`: token ID
+      - `token`: token in string
+      - `bytes`: token in bytes
+      - `prob`: token probability, with the value between 0.0 and 1.0
+    - Number of elements in `top_probs` may be less than `n_probs`
 
 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
-- `model`: The path to the model loaded with `-m`
-- `prompt`: The provided `prompt`
+- `model`: The model alias (for model path, please use `/props` endpoint)
+- `prompt`: The processed `prompt` (special tokens may be added)
 - `stop_type`: Indicating whether the completion has stopped. Possible values are:
   - `none`: Generating (not stopped)
   - `eos`: Stopped because it encountered the EOS token
@@ -654,7 +692,6 @@ This endpoint is public (no API key check). By default, it is read-only. To make
       "mirostat": 0,
       "mirostat_tau": 5.0,
       "mirostat_eta": 0.10000000149011612,
-      "penalize_nl": false,
       "stop": [],
       "max_tokens": -1,
       "n_keep": 0,
@@ -689,7 +726,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
   },
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```
 
@@ -762,6 +800,8 @@ curl http://localhost:8080/v1/chat/completions \
 
 ### POST `/v1/embeddings`: OpenAI-compatible embeddings API
 
+This endpoint requires that the model uses a pooling type other than `none`. The embeddings are normalized using the Euclidean norm.
+
 *Options:*
 
 See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
@@ -794,6 +834,46 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
   }'
   ```
 
+### POST `/embeddings`: non-OpenAI-compatible embeddings API
+
+This endpoint supports all pooling types, including `--pooling none`. When the pooling is `none`, the response will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using the Euclidean norm.
+
+Note that the response format of this endpoint is different from `/v1/embeddings`.
+
+*Options:*
+
+Same as the `/v1/embeddings` endpoint.
+
+*Examples:*
+
+Same as the `/v1/embeddings` endpoint.
+
+**Response format**
+
+```json
+[
+  {
+    "index": 0,
+    "embedding": [
+      [ ... embeddings for token 0   ... ],
+      [ ... embeddings for token 1   ... ],
+      [ ... ],
+      [ ... embeddings for token N-1 ... ]
+    ]
+  },
+  ...
+  {
+    "index": P,
+    "embedding": [
+      [ ... embeddings for token 0   ... ],
+      [ ... embeddings for token 1   ... ],
+      [ ... ],
+      [ ... embeddings for token N-1 ... ]
+    ]
+  }
+]
+```
+
 ### GET `/slots`: Returns the current slots processing state
 
 > [!WARNING]
@@ -844,7 +924,6 @@ Example:
       "mirostat": 0,
       "mirostat_tau": 5.0,
       "mirostat_eta": 0.10000000149011612,
-      "penalize_nl": false,
       "stop": [],
       "max_tokens": -1,
       "n_keep": 0,
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
deleted file mode 100644
index 250729a4412..00000000000
--- a/examples/server/public/index.html
+++ /dev/null
@@ -1,351 +0,0 @@
-<!--
-  This is a single file build of the frontend.
-  It is automatically generated by the build process.
-  Do not edit this file directly.
-  To make changes, refer to the "Web UI" section in the README.
--->
-<!DOCTYPE html>
-<html>
-<head>
-  <meta charset="UTF-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
-  <meta name="color-scheme" content="light dark">
-  <title>🦙 llama.cpp - chat</title>
-  <script type="module" crossorigin>(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))n(r);new MutationObserver(r=>{for(const i of r)if(i.type==="childList")for(const s of i.addedNodes)s.tagName==="LINK"&&s.rel==="modulepreload"&&n(s)}).observe(document,{childList:!0,subtree:!0});function u(r){const i={};return r.integrity&&(i.integrity=r.integrity),r.referrerPolicy&&(i.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?i.credentials="include":r.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function n(r){if(r.ep)return;r.ep=!0;const i=u(r);fetch(r.href,i)}})();/**
-* @vue/shared v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**//*! #__NO_SIDE_EFFECTS__ */function Be(e){const t=Object.create(null);for(const u of e.split(","))t[u]=1;return u=>u in t}const ee={},Pu=[],we=()=>{},dn=()=>!1,vu=e=>e.charCodeAt(0)===111&&e.charCodeAt(1)===110&&(e.charCodeAt(2)>122||e.charCodeAt(2)<97),Rs=e=>e.startsWith("onUpdate:"),te=Object.assign,Ms=(e,t)=>{const u=e.indexOf(t);u>-1&&e.splice(u,1)},Uf=Object.prototype.hasOwnProperty,re=(e,t)=>Uf.call(e,t),$=Array.isArray,Lu=e=>tn(e)==="[object Map]",Du=e=>tn(e)==="[object Set]",Ko=e=>tn(e)==="[object Date]",qf=e=>tn(e)==="[object RegExp]",W=e=>typeof e=="function",Z=e=>typeof e=="string",Ze=e=>typeof e=="symbol",oe=e=>e!==null&&typeof e=="object",Ps=e=>(oe(e)||W(e))&&W(e.then)&&W(e.catch),yl=Object.prototype.toString,tn=e=>yl.call(e),jf=e=>tn(e).slice(8,-1),ti=e=>tn(e)==="[object Object]",Ls=e=>Z(e)&&e!=="NaN"&&e[0]!=="-"&&""+parseInt(e,10)===e,tu=Be(",key,ref,ref_for,ref_key,onVnodeBeforeMount,onVnodeMounted,onVnodeBeforeUpdate,onVnodeUpdated,onVnodeBeforeUnmount,onVnodeUnmounted"),zf=Be("bind,cloak,else-if,else,for,html,if,model,on,once,pre,show,slot,text,memo"),ui=e=>{const t=Object.create(null);return u=>t[u]||(t[u]=e(u))},Kf=/-(\w)/g,de=ui(e=>e.replace(Kf,(t,u)=>u?u.toUpperCase():"")),Wf=/\B([A-Z])/g,qe=ui(e=>e.replace(Wf,"-$1").toLowerCase()),Tu=ui(e=>e.charAt(0).toUpperCase()+e.slice(1)),Bu=ui(e=>e?`on${Tu(e)}`:""),He=(e,t)=>!Object.is(e,t),Vu=(e,...t)=>{for(let u=0;u<e.length;u++)e[u](...t)},ju=(e,t,u,n=!1)=>{Object.defineProperty(e,t,{configurable:!0,enumerable:!1,writable:n,value:u})},Sr=e=>{const t=parseFloat(e);return isNaN(t)?e:t},vr=e=>{const t=Z(e)?Number(e):NaN;return isNaN(t)?e:t};let Wo;const gu=()=>Wo||(Wo=typeof globalThis<"u"?globalThis:typeof self<"u"?self:typeof window<"u"?window:typeof global<"u"?global:{});function Gf(e,t){return e+JSON.stringify(t,(u,n)=>typeof n=="function"?n.toString():n)}const 
Zf="Infinity,undefined,NaN,isFinite,isNaN,parseFloat,parseInt,decodeURI,decodeURIComponent,encodeURI,encodeURIComponent,Math,Number,Date,Array,Object,Boolean,String,RegExp,Map,Set,JSON,Intl,BigInt,console,Error,Symbol",Jf=Be(Zf);function un(e){if($(e)){const t={};for(let u=0;u<e.length;u++){const n=e[u],r=Z(n)?xl(n):un(n);if(r)for(const i in r)t[i]=r[i]}return t}else if(Z(e)||oe(e))return e}const Yf=/;(?![^(]*\))/g,Xf=/:([^]+)/,Qf=/\/\*[^]*?\*\//g;function xl(e){const t={};return e.replace(Qf,"").split(Yf).forEach(u=>{if(u){const n=u.split(Xf);n.length>1&&(t[n[0].trim()]=n[1].trim())}}),t}function ed(e){if(!e)return"";if(Z(e))return e;let t="";for(const u in e){const n=e[u];if(Z(n)||typeof n=="number"){const r=u.startsWith("--")?u:qe(u);t+=`${r}:${n};`}}return t}function nn(e){let t="";if(Z(e))t=e;else if($(e))for(let u=0;u<e.length;u++){const n=nn(e[u]);n&&(t+=n+" ")}else if(oe(e))for(const u in e)e[u]&&(t+=u+" ");return t.trim()}function td(e){if(!e)return null;let{class:t,style:u}=e;return t&&!Z(t)&&(e.class=nn(t)),u&&(e.style=un(u)),e}const 
ud="html,body,base,head,link,meta,style,title,address,article,aside,footer,header,hgroup,h1,h2,h3,h4,h5,h6,nav,section,div,dd,dl,dt,figcaption,figure,picture,hr,img,li,main,ol,p,pre,ul,a,b,abbr,bdi,bdo,br,cite,code,data,dfn,em,i,kbd,mark,q,rp,rt,ruby,s,samp,small,span,strong,sub,sup,time,u,var,wbr,area,audio,map,track,video,embed,object,param,source,canvas,script,noscript,del,ins,caption,col,colgroup,table,thead,tbody,td,th,tr,button,datalist,fieldset,form,input,label,legend,meter,optgroup,option,output,progress,select,textarea,details,dialog,menu,summary,template,blockquote,iframe,tfoot",nd="svg,animate,animateMotion,animateTransform,circle,clipPath,color-profile,defs,desc,discard,ellipse,feBlend,feColorMatrix,feComponentTransfer,feComposite,feConvolveMatrix,feDiffuseLighting,feDisplacementMap,feDistantLight,feDropShadow,feFlood,feFuncA,feFuncB,feFuncG,feFuncR,feGaussianBlur,feImage,feMerge,feMergeNode,feMorphology,feOffset,fePointLight,feSpecularLighting,feSpotLight,feTile,feTurbulence,filter,foreignObject,g,hatch,hatchpath,image,line,linearGradient,marker,mask,mesh,meshgradient,meshpatch,meshrow,metadata,mpath,path,pattern,polygon,polyline,radialGradient,rect,set,solidcolor,stop,switch,symbol,text,textPath,title,tspan,unknown,use,view",rd="annotation,annotation-xml,maction,maligngroup,malignmark,math,menclose,merror,mfenced,mfrac,mfraction,mglyph,mi,mlabeledtr,mlongdiv,mmultiscripts,mn,mo,mover,mpadded,mphantom,mprescripts,mroot,mrow,ms,mscarries,mscarry,msgroup,msline,mspace,msqrt,msrow,mstack,mstyle,msub,msubsup,msup,mtable,mtd,mtext,mtr,munder,munderover,none,semantics",id="area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr",sd=Be(ud),od=Be(nd),cd=Be(rd),ld=Be(id),El="itemscope,allowfullscreen,formnovalidate,ismap,nomodule,novalidate,readonly",ad=Be(El),Go=Be(El+",async,autofocus,autoplay,controls,default,defer,disabled,hidden,inert,loop,open,required,reversed,scoped,seamless,checked,muted,multiple,selected");function 
Bs(e){return!!e||e===""}const fd=Be("accept,accept-charset,accesskey,action,align,allow,alt,async,autocapitalize,autocomplete,autofocus,autoplay,background,bgcolor,border,buffered,capture,challenge,charset,checked,cite,class,code,codebase,color,cols,colspan,content,contenteditable,contextmenu,controls,coords,crossorigin,csp,data,datetime,decoding,default,defer,dir,dirname,disabled,download,draggable,dropzone,enctype,enterkeyhint,for,form,formaction,formenctype,formmethod,formnovalidate,formtarget,headers,height,hidden,high,href,hreflang,http-equiv,icon,id,importance,inert,integrity,ismap,itemprop,keytype,kind,label,lang,language,loading,list,loop,low,manifest,max,maxlength,minlength,media,min,multiple,muted,name,novalidate,open,optimum,pattern,ping,placeholder,poster,preload,radiogroup,readonly,referrerpolicy,rel,required,reversed,rows,rowspan,sandbox,scope,scoped,selected,shape,size,sizes,slot,span,spellcheck,src,srcdoc,srclang,srcset,start,step,style,summary,tabindex,target,title,translate,type,usemap,value,width,wrap"),dd=Be("xmlns,accent-height,accumulate,additive,alignment-baseline,alphabetic,amplitude,arabic-form,ascent,attributeName,attributeType,azimuth,baseFrequency,baseline-shift,baseProfile,bbox,begin,bias,by,calcMode,cap-height,class,clip,clipPathUnits,clip-path,clip-rule,color,color-interpolation,color-interpolation-filters,color-profile,color-rendering,contentScriptType,contentStyleType,crossorigin,cursor,cx,cy,d,decelerate,descent,diffuseConstant,direction,display,divisor,dominant-baseline,dur,dx,dy,edgeMode,elevation,enable-background,end,exponent,fill,fill-opacity,fill-rule,filter,filterRes,filterUnits,flood-color,flood-opacity,font-family,font-size,font-size-adjust,font-stretch,font-style,font-variant,font-weight,format,from,fr,fx,fy,g1,g2,glyph-name,glyph-orientation-horizontal,glyph-orientation-vertical,glyphRef,gradientTransform,gradientUnits,hanging,height,href,hreflang,horiz-adv-x,horiz-origin-x,id,ideographic,image-rendering,in,in2,intercept,
k,k1,k2,k3,k4,kernelMatrix,kernelUnitLength,kerning,keyPoints,keySplines,keyTimes,lang,lengthAdjust,letter-spacing,lighting-color,limitingConeAngle,local,marker-end,marker-mid,marker-start,markerHeight,markerUnits,markerWidth,mask,maskContentUnits,maskUnits,mathematical,max,media,method,min,mode,name,numOctaves,offset,opacity,operator,order,orient,orientation,origin,overflow,overline-position,overline-thickness,panose-1,paint-order,path,pathLength,patternContentUnits,patternTransform,patternUnits,ping,pointer-events,points,pointsAtX,pointsAtY,pointsAtZ,preserveAlpha,preserveAspectRatio,primitiveUnits,r,radius,referrerPolicy,refX,refY,rel,rendering-intent,repeatCount,repeatDur,requiredExtensions,requiredFeatures,restart,result,rotate,rx,ry,scale,seed,shape-rendering,slope,spacing,specularConstant,specularExponent,speed,spreadMethod,startOffset,stdDeviation,stemh,stemv,stitchTiles,stop-color,stop-opacity,strikethrough-position,strikethrough-thickness,string,stroke,stroke-dasharray,stroke-dashoffset,stroke-linecap,stroke-linejoin,stroke-miterlimit,stroke-opacity,stroke-width,style,surfaceScale,systemLanguage,tabindex,tableValues,target,targetX,targetY,text-anchor,text-decoration,text-rendering,textLength,to,transform,transform-origin,type,u1,u2,underline-position,underline-thickness,unicode,unicode-bidi,unicode-range,units-per-em,v-alphabetic,v-hanging,v-ideographic,v-mathematical,values,vector-effect,version,vert-adv-y,vert-origin-x,vert-origin-y,viewBox,viewTarget,visibility,width,widths,word-spacing,writing-mode,x,x-height,x1,x2,xChannelSelector,xlink:actuate,xlink:arcrole,xlink:href,xlink:role,xlink:show,xlink:title,xlink:type,xmlns:xlink,xml:base,xml:lang,xml:space,y,y1,y2,yChannelSelector,z,zoomAndPan");function hd(e){if(e==null)return!1;const t=typeof e;return t==="string"||t==="number"||t==="boolean"}const pd=/[ !"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g;function md(e,t){return e.replace(pd,u=>`\\${u}`)}function bd(e,t){if(e.length!==t.length)return!1;let 
u=!0;for(let n=0;u&&n<e.length;n++)u=su(e[n],t[n]);return u}function su(e,t){if(e===t)return!0;let u=Ko(e),n=Ko(t);if(u||n)return u&&n?e.getTime()===t.getTime():!1;if(u=Ze(e),n=Ze(t),u||n)return e===t;if(u=$(e),n=$(t),u||n)return u&&n?bd(e,t):!1;if(u=oe(e),n=oe(t),u||n){if(!u||!n)return!1;const r=Object.keys(e).length,i=Object.keys(t).length;if(r!==i)return!1;for(const s in e){const o=e.hasOwnProperty(s),c=t.hasOwnProperty(s);if(o&&!c||!o&&c||!su(e[s],t[s]))return!1}}return String(e)===String(t)}function ni(e,t){return e.findIndex(u=>su(u,t))}const Cl=e=>!!(e&&e.__v_isRef===!0),kl=e=>Z(e)?e:e==null?"":$(e)||oe(e)&&(e.toString===yl||!W(e.toString))?Cl(e)?kl(e.value):JSON.stringify(e,Al,2):String(e),Al=(e,t)=>Cl(t)?Al(e,t.value):Lu(t)?{[`Map(${t.size})`]:[...t.entries()].reduce((u,[n,r],i)=>(u[Ni(n,i)+" =>"]=r,u),{})}:Du(t)?{[`Set(${t.size})`]:[...t.values()].map(u=>Ni(u))}:Ze(t)?Ni(t):oe(t)&&!$(t)&&!ti(t)?String(t):t,Ni=(e,t="")=>{var u;return Ze(e)?`Symbol(${(u=e.description)!=null?u:t})`:e};/**
-* @vue/reactivity v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/let $e;class Vs{constructor(t=!1){this.detached=t,this._active=!0,this.effects=[],this.cleanups=[],this._isPaused=!1,this.parent=$e,!t&&$e&&(this.index=($e.scopes||($e.scopes=[])).push(this)-1)}get active(){return this._active}pause(){if(this._active){this._isPaused=!0;let t,u;if(this.scopes)for(t=0,u=this.scopes.length;t<u;t++)this.scopes[t].pause();for(t=0,u=this.effects.length;t<u;t++)this.effects[t].pause()}}resume(){if(this._active&&this._isPaused){this._isPaused=!1;let t,u;if(this.scopes)for(t=0,u=this.scopes.length;t<u;t++)this.scopes[t].resume();for(t=0,u=this.effects.length;t<u;t++)this.effects[t].resume()}}run(t){if(this._active){const u=$e;try{return $e=this,t()}finally{$e=u}}}on(){$e=this}off(){$e=this.parent}stop(t){if(this._active){this._active=!1;let u,n;for(u=0,n=this.effects.length;u<n;u++)this.effects[u].stop();for(this.effects.length=0,u=0,n=this.cleanups.length;u<n;u++)this.cleanups[u]();if(this.cleanups.length=0,this.scopes){for(u=0,n=this.scopes.length;u<n;u++)this.scopes[u].stop(!0);this.scopes.length=0}if(!this.detached&&this.parent&&!t){const r=this.parent.scopes.pop();r&&r!==this&&(this.parent.scopes[this.index]=r,r.index=this.index)}this.parent=void 0}}}function gd(e){return new Vs(e)}function Sl(){return $e}function _d(e,t=!1){$e&&$e.cleanups.push(e)}let le;const Ri=new WeakSet;class vn{constructor(t){this.fn=t,this.deps=void 0,this.depsTail=void 0,this.flags=5,this.next=void 0,this.cleanup=void 0,this.scheduler=void 0,$e&&$e.active&&$e.effects.push(this)}pause(){this.flags|=64}resume(){this.flags&64&&(this.flags&=-65,Ri.has(this)&&(Ri.delete(this),this.trigger()))}notify(){this.flags&2&&!(this.flags&32)||this.flags&8||Dl(this)}run(){if(!(this.flags&1))return this.fn();this.flags|=2,Zo(this),Tl(this);const t=le,u=bt;le=this,bt=!0;try{return this.fn()}finally{wl(this),le=t,bt=u,this.flags&=-3}}stop(){if(this.flags&1){for(let t=this.deps;t;t=t.nextDep)Us(t);this.deps=this.depsTail=void 
0,Zo(this),this.onStop&&this.onStop(),this.flags&=-2}}trigger(){this.flags&64?Ri.add(this):this.scheduler?this.scheduler():this.runIfDirty()}runIfDirty(){ts(this)&&this.run()}get dirty(){return ts(this)}}let vl=0,gn,_n;function Dl(e,t=!1){if(e.flags|=8,t){e.next=_n,_n=e;return}e.next=gn,gn=e}function $s(){vl++}function Hs(){if(--vl>0)return;if(_n){let t=_n;for(_n=void 0;t;){const u=t.next;t.next=void 0,t.flags&=-9,t=u}}let e;for(;gn;){let t=gn;for(gn=void 0;t;){const u=t.next;if(t.next=void 0,t.flags&=-9,t.flags&1)try{t.trigger()}catch(n){e||(e=n)}t=u}}if(e)throw e}function Tl(e){for(let t=e.deps;t;t=t.nextDep)t.version=-1,t.prevActiveLink=t.dep.activeLink,t.dep.activeLink=t}function wl(e){let t,u=e.depsTail,n=u;for(;n;){const r=n.prevDep;n.version===-1?(n===u&&(u=r),Us(n),yd(n)):t=n,n.dep.activeLink=n.prevActiveLink,n.prevActiveLink=void 0,n=r}e.deps=t,e.depsTail=u}function ts(e){for(let t=e.deps;t;t=t.nextDep)if(t.dep.version!==t.version||t.dep.computed&&(Fl(t.dep.computed)||t.dep.version!==t.version))return!0;return!!e._dirty}function Fl(e){if(e.flags&4&&!(e.flags&16)||(e.flags&=-17,e.globalVersion===Dn))return;e.globalVersion=Dn;const t=e.dep;if(e.flags|=2,t.version>0&&!e.isSSR&&e.deps&&!ts(e)){e.flags&=-3;return}const u=le,n=bt;le=e,bt=!0;try{Tl(e);const r=e.fn(e._value);(t.version===0||He(r,e._value))&&(e._value=r,t.version++)}catch(r){throw t.version++,r}finally{le=u,bt=n,wl(e),e.flags&=-3}}function Us(e,t=!1){const{dep:u,prevSub:n,nextSub:r}=e;if(n&&(n.nextSub=r,e.prevSub=void 0),r&&(r.prevSub=n,e.nextSub=void 0),u.subs===e&&(u.subs=n,!n&&u.computed)){u.computed.flags&=-5;for(let i=u.computed.deps;i;i=i.nextDep)Us(i,!0)}!t&&!--u.sc&&u.map&&u.map.delete(u.key)}function yd(e){const{prevDep:t,nextDep:u}=e;t&&(t.nextDep=u,e.prevDep=void 0),u&&(u.prevDep=t,e.nextDep=void 0)}function xd(e,t){e.effect instanceof vn&&(e=e.effect.fn);const u=new vn(e);t&&te(u,t);try{u.run()}catch(r){throw u.stop(),r}const n=u.run.bind(u);return n.effect=u,n}function 
Ed(e){e.effect.stop()}let bt=!0;const Il=[];function $t(){Il.push(bt),bt=!1}function Ht(){const e=Il.pop();bt=e===void 0?!0:e}function Zo(e){const{cleanup:t}=e;if(e.cleanup=void 0,t){const u=le;le=void 0;try{t()}finally{le=u}}}let Dn=0;class Cd{constructor(t,u){this.sub=t,this.dep=u,this.version=u.version,this.nextDep=this.prevDep=this.nextSub=this.prevSub=this.prevActiveLink=void 0}}class ri{constructor(t){this.computed=t,this.version=0,this.activeLink=void 0,this.subs=void 0,this.map=void 0,this.key=void 0,this.sc=0}track(t){if(!le||!bt||le===this.computed)return;let u=this.activeLink;if(u===void 0||u.sub!==le)u=this.activeLink=new Cd(le,this),le.deps?(u.prevDep=le.depsTail,le.depsTail.nextDep=u,le.depsTail=u):le.deps=le.depsTail=u,Ol(u);else if(u.version===-1&&(u.version=this.version,u.nextDep)){const n=u.nextDep;n.prevDep=u.prevDep,u.prevDep&&(u.prevDep.nextDep=n),u.prevDep=le.depsTail,u.nextDep=void 0,le.depsTail.nextDep=u,le.depsTail=u,le.deps===u&&(le.deps=n)}return u}trigger(t){this.version++,Dn++,this.notify(t)}notify(t){$s();try{for(let u=this.subs;u;u=u.prevSub)u.sub.notify()&&u.sub.dep.notify()}finally{Hs()}}}function Ol(e){if(e.dep.sc++,e.sub.flags&4){const t=e.dep.computed;if(t&&!e.dep.subs){t.flags|=20;for(let n=t.deps;n;n=n.nextDep)Ol(n)}const u=e.dep.subs;u!==e&&(e.prevSub=u,u&&(u.nextSub=e)),e.dep.subs=e}}const Dr=new WeakMap,_u=Symbol(""),us=Symbol(""),Tn=Symbol("");function Re(e,t,u){if(bt&&le){let n=Dr.get(e);n||Dr.set(e,n=new Map);let r=n.get(u);r||(n.set(u,r=new ri),r.map=n,r.key=u),r.track()}}function Nt(e,t,u,n,r,i){const s=Dr.get(e);if(!s){Dn++;return}const o=c=>{c&&c.trigger()};if($s(),t==="clear")s.forEach(o);else{const c=$(e),l=c&&Ls(u);if(c&&u==="length"){const a=Number(n);s.forEach((f,p)=>{(p==="length"||p===Tn||!Ze(p)&&p>=a)&&o(f)})}else switch((u!==void 0||s.has(void 
0))&&o(s.get(u)),l&&o(s.get(Tn)),t){case"add":c?l&&o(s.get("length")):(o(s.get(_u)),Lu(e)&&o(s.get(us)));break;case"delete":c||(o(s.get(_u)),Lu(e)&&o(s.get(us)));break;case"set":Lu(e)&&o(s.get(_u));break}}Hs()}function kd(e,t){const u=Dr.get(e);return u&&u.get(t)}function Ou(e){const t=Q(e);return t===e?t:(Re(t,"iterate",Tn),ut(e)?t:t.map(Me))}function ii(e){return Re(e=Q(e),"iterate",Tn),e}const Ad={__proto__:null,[Symbol.iterator](){return Mi(this,Symbol.iterator,Me)},concat(...e){return Ou(this).concat(...e.map(t=>$(t)?Ou(t):t))},entries(){return Mi(this,"entries",e=>(e[1]=Me(e[1]),e))},every(e,t){return wt(this,"every",e,t,void 0,arguments)},filter(e,t){return wt(this,"filter",e,t,u=>u.map(Me),arguments)},find(e,t){return wt(this,"find",e,t,Me,arguments)},findIndex(e,t){return wt(this,"findIndex",e,t,void 0,arguments)},findLast(e,t){return wt(this,"findLast",e,t,Me,arguments)},findLastIndex(e,t){return wt(this,"findLastIndex",e,t,void 0,arguments)},forEach(e,t){return wt(this,"forEach",e,t,void 0,arguments)},includes(...e){return Pi(this,"includes",e)},indexOf(...e){return Pi(this,"indexOf",e)},join(e){return Ou(this).join(e)},lastIndexOf(...e){return Pi(this,"lastIndexOf",e)},map(e,t){return wt(this,"map",e,t,void 0,arguments)},pop(){return cn(this,"pop")},push(...e){return cn(this,"push",e)},reduce(e,...t){return Jo(this,"reduce",e,t)},reduceRight(e,...t){return Jo(this,"reduceRight",e,t)},shift(){return cn(this,"shift")},some(e,t){return wt(this,"some",e,t,void 0,arguments)},splice(...e){return cn(this,"splice",e)},toReversed(){return Ou(this).toReversed()},toSorted(e){return Ou(this).toSorted(e)},toSpliced(...e){return Ou(this).toSpliced(...e)},unshift(...e){return cn(this,"unshift",e)},values(){return Mi(this,"values",Me)}};function Mi(e,t,u){const n=ii(e),r=n[t]();return n!==e&&!ut(e)&&(r._next=r.next,r.next=()=>{const i=r._next();return i.value&&(i.value=u(i.value)),i}),r}const Sd=Array.prototype;function wt(e,t,u,n,r,i){const 
s=ii(e),o=s!==e&&!ut(e),c=s[t];if(c!==Sd[t]){const f=c.apply(e,i);return o?Me(f):f}let l=u;s!==e&&(o?l=function(f,p){return u.call(this,Me(f),p,e)}:u.length>2&&(l=function(f,p){return u.call(this,f,p,e)}));const a=c.call(s,l,n);return o&&r?r(a):a}function Jo(e,t,u,n){const r=ii(e);let i=u;return r!==e&&(ut(e)?u.length>3&&(i=function(s,o,c){return u.call(this,s,o,c,e)}):i=function(s,o,c){return u.call(this,s,Me(o),c,e)}),r[t](i,...n)}function Pi(e,t,u){const n=Q(e);Re(n,"iterate",Tn);const r=n[t](...u);return(r===-1||r===!1)&&li(u[0])?(u[0]=Q(u[0]),n[t](...u)):r}function cn(e,t,u=[]){$t(),$s();const n=Q(e)[t].apply(e,u);return Hs(),Ht(),n}const vd=Be("__proto__,__v_isRef,__isVue"),Nl=new Set(Object.getOwnPropertyNames(Symbol).filter(e=>e!=="arguments"&&e!=="caller").map(e=>Symbol[e]).filter(Ze));function Dd(e){Ze(e)||(e=String(e));const t=Q(this);return Re(t,"has",e),t.hasOwnProperty(e)}class Rl{constructor(t=!1,u=!1){this._isReadonly=t,this._isShallow=u}get(t,u,n){if(u==="__v_skip")return t.__v_skip;const r=this._isReadonly,i=this._isShallow;if(u==="__v_isReactive")return!r;if(u==="__v_isReadonly")return r;if(u==="__v_isShallow")return i;if(u==="__v_raw")return n===(r?i?$l:Vl:i?Bl:Ll).get(t)||Object.getPrototypeOf(t)===Object.getPrototypeOf(n)?t:void 0;const s=$(t);if(!r){let c;if(s&&(c=Ad[u]))return c;if(u==="hasOwnProperty")return Dd}const o=Reflect.get(t,u,Ce(t)?t:n);return(Ze(u)?Nl.has(u):vd(u))||(r||Re(t,"get",u),i)?o:Ce(o)?s&&Ls(u)?o:o.value:oe(o)?r?qs(o):oi(o):o}}class Ml extends Rl{constructor(t=!1){super(!1,t)}set(t,u,n,r){let i=t[u];if(!this._isShallow){const c=ou(i);if(!ut(n)&&!ou(n)&&(i=Q(i),n=Q(n)),!$(t)&&Ce(i)&&!Ce(n))return c?!1:(i.value=n,!0)}const s=$(t)&&Ls(u)?Number(u)<t.length:re(t,u),o=Reflect.set(t,u,n,Ce(t)?t:r);return t===Q(r)&&(s?He(n,i)&&Nt(t,"set",u,n):Nt(t,"add",u,n)),o}deleteProperty(t,u){const n=re(t,u);t[u];const r=Reflect.deleteProperty(t,u);return r&&n&&Nt(t,"delete",u,void 0),r}has(t,u){const 
n=Reflect.has(t,u);return(!Ze(u)||!Nl.has(u))&&Re(t,"has",u),n}ownKeys(t){return Re(t,"iterate",$(t)?"length":_u),Reflect.ownKeys(t)}}class Pl extends Rl{constructor(t=!1){super(!0,t)}set(t,u){return!0}deleteProperty(t,u){return!0}}const Td=new Ml,wd=new Pl,Fd=new Ml(!0),Id=new Pl(!0),ns=e=>e,rr=e=>Reflect.getPrototypeOf(e);function Od(e,t,u){return function(...n){const r=this.__v_raw,i=Q(r),s=Lu(i),o=e==="entries"||e===Symbol.iterator&&s,c=e==="keys"&&s,l=r[e](...n),a=u?ns:t?rs:Me;return!t&&Re(i,"iterate",c?us:_u),{next(){const{value:f,done:p}=l.next();return p?{value:f,done:p}:{value:o?[a(f[0]),a(f[1])]:a(f),done:p}},[Symbol.iterator](){return this}}}}function ir(e){return function(...t){return e==="delete"?!1:e==="clear"?void 0:this}}function Nd(e,t){const u={get(r){const i=this.__v_raw,s=Q(i),o=Q(r);e||(He(r,o)&&Re(s,"get",r),Re(s,"get",o));const{has:c}=rr(s),l=t?ns:e?rs:Me;if(c.call(s,r))return l(i.get(r));if(c.call(s,o))return l(i.get(o));i!==s&&i.get(r)},get size(){const r=this.__v_raw;return!e&&Re(Q(r),"iterate",_u),Reflect.get(r,"size",r)},has(r){const i=this.__v_raw,s=Q(i),o=Q(r);return e||(He(r,o)&&Re(s,"has",r),Re(s,"has",o)),r===o?i.has(r):i.has(r)||i.has(o)},forEach(r,i){const s=this,o=s.__v_raw,c=Q(o),l=t?ns:e?rs:Me;return!e&&Re(c,"iterate",_u),o.forEach((a,f)=>r.call(i,l(a),l(f),s))}};return te(u,e?{add:ir("add"),set:ir("set"),delete:ir("delete"),clear:ir("clear")}:{add(r){!t&&!ut(r)&&!ou(r)&&(r=Q(r));const i=Q(this);return rr(i).has.call(i,r)||(i.add(r),Nt(i,"add",r,r)),this},set(r,i){!t&&!ut(i)&&!ou(i)&&(i=Q(i));const s=Q(this),{has:o,get:c}=rr(s);let l=o.call(s,r);l||(r=Q(r),l=o.call(s,r));const a=c.call(s,r);return s.set(r,i),l?He(i,a)&&Nt(s,"set",r,i):Nt(s,"add",r,i),this},delete(r){const i=Q(this),{has:s,get:o}=rr(i);let c=s.call(i,r);c||(r=Q(r),c=s.call(i,r)),o&&o.call(i,r);const l=i.delete(r);return c&&Nt(i,"delete",r,void 0),l},clear(){const r=Q(this),i=r.size!==0,s=r.clear();return i&&Nt(r,"clear",void 0,void 
0),s}}),["keys","values","entries",Symbol.iterator].forEach(r=>{u[r]=Od(r,e,t)}),u}function si(e,t){const u=Nd(e,t);return(n,r,i)=>r==="__v_isReactive"?!e:r==="__v_isReadonly"?e:r==="__v_raw"?n:Reflect.get(re(u,r)&&r in n?u:n,r,i)}const Rd={get:si(!1,!1)},Md={get:si(!1,!0)},Pd={get:si(!0,!1)},Ld={get:si(!0,!0)},Ll=new WeakMap,Bl=new WeakMap,Vl=new WeakMap,$l=new WeakMap;function Bd(e){switch(e){case"Object":case"Array":return 1;case"Map":case"Set":case"WeakMap":case"WeakSet":return 2;default:return 0}}function Vd(e){return e.__v_skip||!Object.isExtensible(e)?0:Bd(jf(e))}function oi(e){return ou(e)?e:ci(e,!1,Td,Rd,Ll)}function Hl(e){return ci(e,!1,Fd,Md,Bl)}function qs(e){return ci(e,!0,wd,Pd,Vl)}function $d(e){return ci(e,!0,Id,Ld,$l)}function ci(e,t,u,n,r){if(!oe(e)||e.__v_raw&&!(t&&e.__v_isReactive))return e;const i=r.get(e);if(i)return i;const s=Vd(e);if(s===0)return e;const o=new Proxy(e,s===2?n:u);return r.set(e,o),o}function uu(e){return ou(e)?uu(e.__v_raw):!!(e&&e.__v_isReactive)}function ou(e){return!!(e&&e.__v_isReadonly)}function ut(e){return!!(e&&e.__v_isShallow)}function li(e){return e?!!e.__v_raw:!1}function Q(e){const t=e&&e.__v_raw;return t?Q(t):e}function Ul(e){return!re(e,"__v_skip")&&Object.isExtensible(e)&&ju(e,"__v_skip",!0),e}const Me=e=>oe(e)?oi(e):e,rs=e=>oe(e)?qs(e):e;function Ce(e){return e?e.__v_isRef===!0:!1}function yn(e){return ql(e,!1)}function js(e){return ql(e,!0)}function ql(e,t){return Ce(e)?e:new Hd(e,t)}class Hd{constructor(t,u){this.dep=new ri,this.__v_isRef=!0,this.__v_isShallow=!1,this._rawValue=u?t:Q(t),this._value=u?t:Me(t),this.__v_isShallow=u}get value(){return this.dep.track(),this._value}set value(t){const u=this._rawValue,n=this.__v_isShallow||ut(t)||ou(t);t=n?t:Q(t),He(t,u)&&(this._rawValue=t,this._value=n?t:Me(t),this.dep.trigger())}}function Ud(e){e.dep&&e.dep.trigger()}function ai(e){return Ce(e)?e.value:e}function qd(e){return W(e)?e():ai(e)}const 
jd={get:(e,t,u)=>t==="__v_raw"?e:ai(Reflect.get(e,t,u)),set:(e,t,u,n)=>{const r=e[t];return Ce(r)&&!Ce(u)?(r.value=u,!0):Reflect.set(e,t,u,n)}};function zs(e){return uu(e)?e:new Proxy(e,jd)}class zd{constructor(t){this.__v_isRef=!0,this._value=void 0;const u=this.dep=new ri,{get:n,set:r}=t(u.track.bind(u),u.trigger.bind(u));this._get=n,this._set=r}get value(){return this._value=this._get()}set value(t){this._set(t)}}function jl(e){return new zd(e)}function Kd(e){const t=$(e)?new Array(e.length):{};for(const u in e)t[u]=zl(e,u);return t}class Wd{constructor(t,u,n){this._object=t,this._key=u,this._defaultValue=n,this.__v_isRef=!0,this._value=void 0}get value(){const t=this._object[this._key];return this._value=t===void 0?this._defaultValue:t}set value(t){this._object[this._key]=t}get dep(){return kd(Q(this._object),this._key)}}class Gd{constructor(t){this._getter=t,this.__v_isRef=!0,this.__v_isReadonly=!0,this._value=void 0}get value(){return this._value=this._getter()}}function Zd(e,t,u){return Ce(e)?e:W(e)?new Gd(e):oe(e)&&arguments.length>1?zl(e,t,u):yn(e)}function zl(e,t,u){const n=e[t];return Ce(n)?n:new Wd(e,t,u)}class Jd{constructor(t,u,n){this.fn=t,this.setter=u,this._value=void 0,this.dep=new ri(this),this.__v_isRef=!0,this.deps=void 0,this.depsTail=void 0,this.flags=16,this.globalVersion=Dn-1,this.next=void 0,this.effect=this,this.__v_isReadonly=!u,this.isSSR=n}notify(){if(this.flags|=16,!(this.flags&8)&&le!==this)return Dl(this,!0),!0}get value(){const t=this.dep.track();return Fl(this),t&&(t.version=this.dep.version),this._value}set value(t){this.setter&&this.setter(t)}}function Yd(e,t,u=!1){let n,r;return W(e)?n=e:(n=e.get,r=e.set),new Jd(n,r,u)}const Xd={GET:"get",HAS:"has",ITERATE:"iterate"},Qd={SET:"set",ADD:"add",DELETE:"delete",CLEAR:"clear"},sr={},Tr=new WeakMap;let Wt;function eh(){return Wt}function Kl(e,t=!1,u=Wt){if(u){let n=Tr.get(u);n||Tr.set(u,n=[]),n.push(e)}}function 
th(e,t,u=ee){const{immediate:n,deep:r,once:i,scheduler:s,augmentJob:o,call:c}=u,l=g=>r?g:ut(g)||r===!1||r===0?Rt(g,1):Rt(g);let a,f,p,d,h=!1,_=!1;if(Ce(e)?(f=()=>e.value,h=ut(e)):uu(e)?(f=()=>l(e),h=!0):$(e)?(_=!0,h=e.some(g=>uu(g)||ut(g)),f=()=>e.map(g=>{if(Ce(g))return g.value;if(uu(g))return l(g);if(W(g))return c?c(g,2):g()})):W(e)?t?f=c?()=>c(e,2):e:f=()=>{if(p){$t();try{p()}finally{Ht()}}const g=Wt;Wt=a;try{return c?c(e,3,[d]):e(d)}finally{Wt=g}}:f=we,t&&r){const g=f,x=r===!0?1/0:r;f=()=>Rt(g(),x)}const v=Sl(),A=()=>{a.stop(),v&&v.active&&Ms(v.effects,a)};if(i&&t){const g=t;t=(...x)=>{g(...x),A()}}let E=_?new Array(e.length).fill(sr):sr;const m=g=>{if(!(!(a.flags&1)||!a.dirty&&!g))if(t){const x=a.run();if(r||h||(_?x.some((k,I)=>He(k,E[I])):He(x,E))){p&&p();const k=Wt;Wt=a;try{const I=[x,E===sr?void 0:_&&E[0]===sr?[]:E,d];c?c(t,3,I):t(...I),E=x}finally{Wt=k}}}else a.run()};return o&&o(m),a=new vn(f),a.scheduler=s?()=>s(m,!1):m,d=g=>Kl(g,!1,a),p=a.onStop=()=>{const g=Tr.get(a);if(g){if(c)c(g,4);else for(const x of g)x();Tr.delete(a)}},t?n?m(!0):E=a.run():s?s(m.bind(null,!0),!0):a.run(),A.pause=a.pause.bind(a),A.resume=a.resume.bind(a),A.stop=A,A}function Rt(e,t=1/0,u){if(t<=0||!oe(e)||e.__v_skip||(u=u||new Set,u.has(e)))return e;if(u.add(e),t--,Ce(e))Rt(e.value,t,u);else if($(e))for(let n=0;n<e.length;n++)Rt(e[n],t,u);else if(Du(e)||Lu(e))e.forEach(n=>{Rt(n,t,u)});else if(ti(e)){for(const n in e)Rt(e[n],t,u);for(const n of Object.getOwnPropertySymbols(e))Object.prototype.propertyIsEnumerable.call(e,n)&&Rt(e[n],t,u)}return e}/**
-* @vue/runtime-core v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/const yu=[];function uh(e){yu.push(e)}function nh(){yu.pop()}let Li=!1;function zt(e,...t){if(Li)return;Li=!0,$t();const u=yu.length?yu[yu.length-1].component:null,n=u&&u.appContext.config.warnHandler,r=rh();if(n)wu(n,u,11,[e+t.map(i=>{var s,o;return(o=(s=i.toString)==null?void 0:s.call(i))!=null?o:JSON.stringify(i)}).join(""),u&&u.proxy,r.map(({vnode:i})=>`at <${e0(u,i.type)}>`).join(`
-`),r]);else{const i=[`[Vue warn]: ${e}`,...t];r.length&&i.push(`
-`,...ih(r)),console.warn(...i)}Ht(),Li=!1}function rh(){let e=yu[yu.length-1];if(!e)return[];const t=[];for(;e;){const u=t[0];u&&u.vnode===e?u.recurseCount++:t.push({vnode:e,recurseCount:0});const n=e.component&&e.component.parent;e=n&&n.vnode}return t}function ih(e){const t=[];return e.forEach((u,n)=>{t.push(...n===0?[]:[`
-`],...sh(u))}),t}function sh({vnode:e,recurseCount:t}){const u=t>0?`... (${t} recursive calls)`:"",n=e.component?e.component.parent==null:!1,r=` at <${e0(e.component,e.type,n)}`,i=">"+u;return e.props?[r,...oh(e.props),i]:[r+i]}function oh(e){const t=[],u=Object.keys(e);return u.slice(0,3).forEach(n=>{t.push(...Wl(n,e[n]))}),u.length>3&&t.push(" ..."),t}function Wl(e,t,u){return Z(t)?(t=JSON.stringify(t),u?t:[`${e}=${t}`]):typeof t=="number"||typeof t=="boolean"||t==null?u?t:[`${e}=${t}`]:Ce(t)?(t=Wl(e,Q(t.value),!0),u?t:[`${e}=Ref<`,t,">"]):W(t)?[`${e}=fn${t.name?`<${t.name}>`:""}`]:(t=Q(t),u?t:[`${e}=`,t])}function ch(e,t){}const lh={SETUP_FUNCTION:0,0:"SETUP_FUNCTION",RENDER_FUNCTION:1,1:"RENDER_FUNCTION",NATIVE_EVENT_HANDLER:5,5:"NATIVE_EVENT_HANDLER",COMPONENT_EVENT_HANDLER:6,6:"COMPONENT_EVENT_HANDLER",VNODE_HOOK:7,7:"VNODE_HOOK",DIRECTIVE_HOOK:8,8:"DIRECTIVE_HOOK",TRANSITION_HOOK:9,9:"TRANSITION_HOOK",APP_ERROR_HANDLER:10,10:"APP_ERROR_HANDLER",APP_WARN_HANDLER:11,11:"APP_WARN_HANDLER",FUNCTION_REF:12,12:"FUNCTION_REF",ASYNC_COMPONENT_LOADER:13,13:"ASYNC_COMPONENT_LOADER",SCHEDULER:14,14:"SCHEDULER",COMPONENT_UPDATE:15,15:"COMPONENT_UPDATE",APP_UNMOUNT_CLEANUP:16,16:"APP_UNMOUNT_CLEANUP"},ah={sp:"serverPrefetch hook",bc:"beforeCreate hook",c:"created hook",bm:"beforeMount hook",m:"mounted hook",bu:"beforeUpdate hook",u:"updated",bum:"beforeUnmount hook",um:"unmounted hook",a:"activated hook",da:"deactivated hook",ec:"errorCaptured hook",rtc:"renderTracked hook",rtg:"renderTriggered hook",0:"setup function",1:"render function",2:"watcher getter",3:"watcher callback",4:"watcher cleanup function",5:"native event handler",6:"component event handler",7:"vnode hook",8:"directive hook",9:"transition hook",10:"app errorHandler",11:"app warnHandler",12:"ref function",13:"async component loader",14:"scheduler flush",15:"component update",16:"app unmount cleanup function"};function wu(e,t,u,n){try{return n?e(...n):e()}catch(r){Fu(r,t,u)}}function 
lt(e,t,u,n){if(W(e)){const r=wu(e,t,u,n);return r&&Ps(r)&&r.catch(i=>{Fu(i,t,u)}),r}if($(e)){const r=[];for(let i=0;i<e.length;i++)r.push(lt(e[i],t,u,n));return r}}function Fu(e,t,u,n=!0){const r=t?t.vnode:null,{errorHandler:i,throwUnhandledErrorInProduction:s}=t&&t.appContext.config||ee;if(t){let o=t.parent;const c=t.proxy,l=`https://vuejs.org/error-reference/#runtime-${u}`;for(;o;){const a=o.ec;if(a){for(let f=0;f<a.length;f++)if(a[f](e,c,l)===!1)return}o=o.parent}if(i){$t(),wu(i,null,10,[e,c,l]),Ht();return}}fh(e,u,r,n,s)}function fh(e,t,u,n=!0,r=!1){if(r)throw e;console.error(e)}const Ue=[];let Ct=-1;const $u=[];let Gt=null,Mu=0;const Gl=Promise.resolve();let wr=null;function fi(e){const t=wr||Gl;return e?t.then(this?e.bind(this):e):t}function dh(e){let t=Ct+1,u=Ue.length;for(;t<u;){const n=t+u>>>1,r=Ue[n],i=Fn(r);i<e||i===e&&r.flags&2?t=n+1:u=n}return t}function Ks(e){if(!(e.flags&1)){const t=Fn(e),u=Ue[Ue.length-1];!u||!(e.flags&2)&&t>=Fn(u)?Ue.push(e):Ue.splice(dh(t),0,e),e.flags|=1,Zl()}}function Zl(){wr||(wr=Gl.then(Jl))}function wn(e){$(e)?$u.push(...e):Gt&&e.id===-1?Gt.splice(Mu+1,0,e):e.flags&1||($u.push(e),e.flags|=1),Zl()}function Yo(e,t,u=Ct+1){for(;u<Ue.length;u++){const n=Ue[u];if(n&&n.flags&2){if(e&&n.id!==e.uid)continue;Ue.splice(u,1),u--,n.flags&4&&(n.flags&=-2),n(),n.flags&4||(n.flags&=-2)}}}function Fr(e){if($u.length){const t=[...new Set($u)].sort((u,n)=>Fn(u)-Fn(n));if($u.length=0,Gt){Gt.push(...t);return}for(Gt=t,Mu=0;Mu<Gt.length;Mu++){const u=Gt[Mu];u.flags&4&&(u.flags&=-2),u.flags&8||u(),u.flags&=-2}Gt=null,Mu=0}}const Fn=e=>e.id==null?e.flags&2?-1:1/0:e.id;function Jl(e){try{for(Ct=0;Ct<Ue.length;Ct++){const t=Ue[Ct];t&&!(t.flags&8)&&(t.flags&4&&(t.flags&=-2),wu(t,t.i,t.i?15:14),t.flags&4||(t.flags&=-2))}}finally{for(;Ct<Ue.length;Ct++){const t=Ue[Ct];t&&(t.flags&=-2)}Ct=-1,Ue.length=0,Fr(),wr=null,(Ue.length||$u.length)&&Jl()}}let pt,hn=[],is=!1;function di(e,...t){pt?pt.emit(e,...t):is||hn.push({event:e,args:t})}function Ws(e,t){var 
u,n;pt=e,pt?(pt.enabled=!0,hn.forEach(({event:r,args:i})=>pt.emit(r,...i)),hn=[]):typeof window<"u"&&window.HTMLElement&&!((n=(u=window.navigator)==null?void 0:u.userAgent)!=null&&n.includes("jsdom"))?((t.__VUE_DEVTOOLS_HOOK_REPLAY__=t.__VUE_DEVTOOLS_HOOK_REPLAY__||[]).push(i=>{Ws(i,t)}),setTimeout(()=>{pt||(t.__VUE_DEVTOOLS_HOOK_REPLAY__=null,is=!0,hn=[])},3e3)):(is=!0,hn=[])}function hh(e,t){di("app:init",e,t,{Fragment:Ee,Text:Pt,Comment:ye,Static:ru})}function ph(e){di("app:unmount",e)}const ss=Gs("component:added"),Yl=Gs("component:updated"),mh=Gs("component:removed"),bh=e=>{pt&&typeof pt.cleanupBuffer=="function"&&!pt.cleanupBuffer(e)&&mh(e)};/*! #__NO_SIDE_EFFECTS__ */function Gs(e){return t=>{di(e,t.appContext.app,t.uid,t.parent?t.parent.uid:void 0,t)}}function gh(e,t,u){di("component:emit",e.appContext.app,e,t,u)}let ve=null,hi=null;function In(e){const t=ve;return ve=e,hi=e&&e.type.__scopeId||null,t}function _h(e){hi=e}function yh(){hi=null}const xh=e=>Zs;function Zs(e,t=ve,u){if(!t||e._n)return e;const n=(...r)=>{n._d&&ps(-1);const i=In(t);let s;try{s=e(...r)}finally{In(i),n._d&&ps(1)}return __VUE_PROD_DEVTOOLS__&&Yl(t),s};return n._n=!0,n._c=!0,n._d=!0,n}function Eh(e,t){if(ve===null)return e;const u=Jn(ve),n=e.dirs||(e.dirs=[]);for(let r=0;r<t.length;r++){let[i,s,o,c=ee]=t[r];i&&(W(i)&&(i={mounted:i,updated:i}),i.deep&&Rt(s),n.push({dir:i,instance:u,value:s,oldValue:void 0,arg:o,modifiers:c}))}return e}function kt(e,t,u,n){const r=e.dirs,i=t&&t.dirs;for(let s=0;s<r.length;s++){const o=r[s];i&&(o.oldValue=i[s].value);let c=o.dir[n];c&&($t(),lt(c,u,8,[e.el,o,e,t]),Ht())}}const Xl=Symbol("_vte"),Ql=e=>e.__isTeleport,xn=e=>e&&(e.disabled||e.disabled===""),Xo=e=>e&&(e.defer||e.defer===""),Qo=e=>typeof SVGElement<"u"&&e instanceof SVGElement,ec=e=>typeof MathMLElement=="function"&&e instanceof MathMLElement,os=(e,t)=>{const u=e&&e.to;return 
Z(u)?t?t(u):null:u},ea={name:"Teleport",__isTeleport:!0,process(e,t,u,n,r,i,s,o,c,l){const{mc:a,pc:f,pbc:p,o:{insert:d,querySelector:h,createText:_,createComment:v}}=l,A=xn(t.props);let{shapeFlag:E,children:m,dynamicChildren:g}=t;if(e==null){const x=t.el=_(""),k=t.anchor=_("");d(x,u,n),d(k,u,n);const I=(C,S)=>{E&16&&(r&&r.isCE&&(r.ce._teleportTarget=C),a(m,C,S,r,i,s,o,c))},w=()=>{const C=t.target=os(t.props,h),S=ta(C,t,_,d);C&&(s!=="svg"&&Qo(C)?s="svg":s!=="mathml"&&ec(C)&&(s="mathml"),A||(I(C,S),gr(t,!1)))};A&&(I(u,k),gr(t,!0)),Xo(t.props)?ke(()=>{w(),t.el.__isMounted=!0},i):w()}else{if(Xo(t.props)&&!e.el.__isMounted){ke(()=>{ea.process(e,t,u,n,r,i,s,o,c,l),delete e.el.__isMounted},i);return}t.el=e.el,t.targetStart=e.targetStart;const x=t.anchor=e.anchor,k=t.target=e.target,I=t.targetAnchor=e.targetAnchor,w=xn(e.props),C=w?u:k,S=w?x:I;if(s==="svg"||Qo(k)?s="svg":(s==="mathml"||ec(k))&&(s="mathml"),g?(p(e.dynamicChildren,g,C,r,i,s,o),io(e,t,!0)):c||f(e,t,C,S,r,i,s,o,!1),A)w?t.props&&e.props&&t.props.to!==e.props.to&&(t.props.to=e.props.to):or(t,u,x,l,1);else if((t.props&&t.props.to)!==(e.props&&e.props.to)){const R=t.target=os(t.props,h);R&&or(t,R,null,l,0)}else w&&or(t,k,I,l,1);gr(t,A)}},remove(e,t,u,{um:n,o:{remove:r}},i){const{shapeFlag:s,children:o,anchor:c,targetStart:l,targetAnchor:a,target:f,props:p}=e;if(f&&(r(l),r(a)),i&&r(c),s&16){const d=i||!xn(p);for(let h=0;h<o.length;h++){const _=o[h];n(_,t,u,d,!!_.dynamicChildren)}}},move:or,hydrate:Ch};function or(e,t,u,{o:{insert:n},m:r},i=2){i===0&&n(e.targetAnchor,t,u);const{el:s,anchor:o,shapeFlag:c,children:l,props:a}=e,f=i===2;if(f&&n(s,t,u),(!f||xn(a))&&c&16)for(let p=0;p<l.length;p++)r(l[p],t,u,2);f&&n(o,t,u)}function Ch(e,t,u,n,r,i,{o:{nextSibling:s,parentNode:o,querySelector:c,insert:l,createText:a}},f){const p=t.target=os(t.props,c);if(p){const d=xn(t.props),h=p._lpa||p.firstChild;if(t.shapeFlag&16)if(d)t.anchor=f(s(e),t,o(e),u,n,r,i),t.targetStart=h,t.targetAnchor=h&&s(h);else{t.anchor=s(e);let 
_=h;for(;_;){if(_&&_.nodeType===8){if(_.data==="teleport start anchor")t.targetStart=_;else if(_.data==="teleport anchor"){t.targetAnchor=_,p._lpa=t.targetAnchor&&s(t.targetAnchor);break}}_=s(_)}t.targetAnchor||ta(p,t,a,l),f(h&&s(h),t,p,u,n,r,i)}gr(t,d)}return t.anchor&&s(t.anchor)}const kh=ea;function gr(e,t){const u=e.ctx;if(u&&u.ut){let n,r;for(t?(n=e.el,r=e.anchor):(n=e.targetStart,r=e.targetAnchor);n&&n!==r;)n.nodeType===1&&n.setAttribute("data-v-owner",u.uid),n=n.nextSibling;u.ut()}}function ta(e,t,u,n){const r=t.targetStart=u(""),i=t.targetAnchor=u("");return r[Xl]=i,e&&(n(r,e),n(i,e)),i}const Zt=Symbol("_leaveCb"),cr=Symbol("_enterCb");function Js(){const e={isMounted:!1,isLeaving:!1,isUnmounting:!1,leavingVNodes:new Map};return Gn(()=>{e.isMounted=!0}),gi(()=>{e.isUnmounting=!0}),e}const it=[Function,Array],Ys={mode:String,appear:Boolean,persisted:Boolean,onBeforeEnter:it,onEnter:it,onAfterEnter:it,onEnterCancelled:it,onBeforeLeave:it,onLeave:it,onAfterLeave:it,onLeaveCancelled:it,onBeforeAppear:it,onAppear:it,onAfterAppear:it,onAppearCancelled:it},ua=e=>{const t=e.subTree;return t.component?ua(t.component):t},Ah={name:"BaseTransition",props:Ys,setup(e,{slots:t}){const u=ft(),n=Js();return()=>{const r=t.default&&pi(t.default(),!0);if(!r||!r.length)return;const i=na(r),s=Q(e),{mode:o}=s;if(n.isLeaving)return Bi(i);const c=tc(i);if(!c)return Bi(i);let l=zu(c,s,n,u,f=>l=f);c.type!==ye&&Bt(c,l);let a=u.subTree&&tc(u.subTree);if(a&&a.type!==ye&&!mt(c,a)&&ua(u).type!==ye){let f=zu(a,s,n,u);if(Bt(a,f),o==="out-in"&&c.type!==ye)return n.isLeaving=!0,f.afterLeave=()=>{n.isLeaving=!1,u.job.flags&8||u.update(),delete f.afterLeave,a=void 0},Bi(i);o==="in-out"&&c.type!==ye?f.delayLeave=(p,d,h)=>{const _=ia(n,a);_[String(a.key)]=a,p[Zt]=()=>{d(),p[Zt]=void 0,delete l.delayedLeave,a=void 0},l.delayedLeave=()=>{h(),delete l.delayedLeave,a=void 0}}:a=void 0}else a&&(a=void 0);return i}}};function na(e){let t=e[0];if(e.length>1){for(const u of 
e)if(u.type!==ye){t=u;break}}return t}const ra=Ah;function ia(e,t){const{leavingVNodes:u}=e;let n=u.get(t.type);return n||(n=Object.create(null),u.set(t.type,n)),n}function zu(e,t,u,n,r){const{appear:i,mode:s,persisted:o=!1,onBeforeEnter:c,onEnter:l,onAfterEnter:a,onEnterCancelled:f,onBeforeLeave:p,onLeave:d,onAfterLeave:h,onLeaveCancelled:_,onBeforeAppear:v,onAppear:A,onAfterAppear:E,onAppearCancelled:m}=t,g=String(e.key),x=ia(u,e),k=(C,S)=>{C&&lt(C,n,9,S)},I=(C,S)=>{const R=S[1];k(C,S),$(C)?C.every(T=>T.length<=1)&&R():C.length<=1&&R()},w={mode:s,persisted:o,beforeEnter(C){let S=c;if(!u.isMounted)if(i)S=v||c;else return;C[Zt]&&C[Zt](!0);const R=x[g];R&&mt(e,R)&&R.el[Zt]&&R.el[Zt](),k(S,[C])},enter(C){let S=l,R=a,T=f;if(!u.isMounted)if(i)S=A||l,R=E||a,T=m||f;else return;let P=!1;const q=C[cr]=z=>{P||(P=!0,z?k(T,[C]):k(R,[C]),w.delayedLeave&&w.delayedLeave(),C[cr]=void 0)};S?I(S,[C,q]):q()},leave(C,S){const R=String(e.key);if(C[cr]&&C[cr](!0),u.isUnmounting)return S();k(p,[C]);let T=!1;const P=C[Zt]=q=>{T||(T=!0,S(),q?k(_,[C]):k(h,[C]),C[Zt]=void 0,x[R]===e&&delete x[R])};x[R]=e,d?I(d,[C,P]):P()},clone(C){const S=zu(C,t,u,n,r);return r&&r(S),S}};return w}function Bi(e){if(Wn(e))return e=vt(e),e.children=null,e}function tc(e){if(!Wn(e))return Ql(e.type)&&e.children?na(e.children):e;const{shapeFlag:t,children:u}=e;if(u){if(t&16)return u[0];if(t&32&&W(u.default))return u.default()}}function Bt(e,t){e.shapeFlag&6&&e.component?(e.transition=t,Bt(e.component.subTree,t)):e.shapeFlag&128?(e.ssContent.transition=t.clone(e.ssContent),e.ssFallback.transition=t.clone(e.ssFallback)):e.transition=t}function pi(e,t=!1,u){let n=[],r=0;for(let i=0;i<e.length;i++){let s=e[i];const o=u==null?s.key:String(u)+String(s.key!=null?s.key:i);s.type===Ee?(s.patchFlag&128&&r++,n=n.concat(pi(s.children,t,o))):(t||s.type!==ye)&&n.push(o!=null?vt(s,{key:o}):s)}if(r>1)for(let i=0;i<n.length;i++)n[i].patchFlag=-2;return n}/*! 
#__NO_SIDE_EFFECTS__ */function Kn(e,t){return W(e)?te({name:e.name},t,{setup:e}):e}function Sh(){const e=ft();return e?(e.appContext.config.idPrefix||"v")+"-"+e.ids[0]+e.ids[1]++:""}function Xs(e){e.ids=[e.ids[0]+e.ids[2]+++"-",0,0]}function vh(e){const t=ft(),u=js(null);if(t){const r=t.refs===ee?t.refs={}:t.refs;Object.defineProperty(r,e,{enumerable:!0,get:()=>u.value,set:i=>u.value=i})}return u}function On(e,t,u,n,r=!1){if($(e)){e.forEach((h,_)=>On(h,t&&($(t)?t[_]:t),u,n,r));return}if(nu(n)&&!r){n.shapeFlag&512&&n.type.__asyncResolved&&n.component.subTree.component&&On(e,t,u,n.component.subTree);return}const i=n.shapeFlag&4?Jn(n.component):n.el,s=r?null:i,{i:o,r:c}=e,l=t&&t.r,a=o.refs===ee?o.refs={}:o.refs,f=o.setupState,p=Q(f),d=f===ee?()=>!1:h=>re(p,h);if(l!=null&&l!==c&&(Z(l)?(a[l]=null,d(l)&&(f[l]=null)):Ce(l)&&(l.value=null)),W(c))wu(c,o,12,[s,a]);else{const h=Z(c),_=Ce(c);if(h||_){const v=()=>{if(e.f){const A=h?d(c)?f[c]:a[c]:c.value;r?$(A)&&Ms(A,i):$(A)?A.includes(i)||A.push(i):h?(a[c]=[i],d(c)&&(f[c]=a[c])):(c.value=[i],e.k&&(a[e.k]=c.value))}else h?(a[c]=s,d(c)&&(f[c]=s)):_&&(c.value=s,e.k&&(a[e.k]=s))};s?(v.id=-1,ke(v,u)):v()}}}let uc=!1;const au=()=>{uc||(console.error("Hydration completed but contains mismatches."),uc=!0)},Dh=e=>e.namespaceURI.includes("svg")&&e.tagName!=="foreignObject",Th=e=>e.namespaceURI.includes("MathML"),lr=e=>{if(e.nodeType===1){if(Dh(e))return"svg";if(Th(e))return"mathml"}},pu=e=>e.nodeType===8;function wh(e){const{mt:t,p:u,o:{patchProp:n,createText:r,nextSibling:i,parentNode:s,remove:o,insert:c,createComment:l}}=e,a=(m,g)=>{if(!g.hasChildNodes()){__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Attempting to hydrate existing markup but container is empty. 
Performing full mount instead."),u(null,m,g),Fr(),g._vnode=m;return}f(g.firstChild,m,null,null,null),Fr(),g._vnode=m},f=(m,g,x,k,I,w=!1)=>{w=w||!!g.dynamicChildren;const C=pu(m)&&m.data==="[",S=()=>_(m,g,x,k,I,C),{type:R,ref:T,shapeFlag:P,patchFlag:q}=g;let z=m.nodeType;g.el=m,__VUE_PROD_DEVTOOLS__&&(ju(m,"__vnode",g,!0),ju(m,"__vueParentComponent",x,!0)),q===-2&&(w=!1,g.dynamicChildren=null);let H=null;switch(R){case Pt:z!==3?g.children===""?(c(g.el=r(""),s(m),m),H=m):H=S():(m.data!==g.children&&(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Hydration text mismatch in",m.parentNode,`
-  - rendered on server: ${JSON.stringify(m.data)}
-  - expected on client: ${JSON.stringify(g.children)}`),au(),m.data=g.children),H=i(m));break;case ye:E(m)?(H=i(m),A(g.el=m.content.firstChild,m,x)):z!==8||C?H=S():H=i(m);break;case ru:if(C&&(m=i(m),z=m.nodeType),z===1||z===3){H=m;const J=!g.children.length;for(let j=0;j<g.staticCount;j++)J&&(g.children+=H.nodeType===1?H.outerHTML:H.data),j===g.staticCount-1&&(g.anchor=H),H=i(H);return C?i(H):H}else S();break;case Ee:C?H=h(m,g,x,k,I,w):H=S();break;default:if(P&1)(z!==1||g.type.toLowerCase()!==m.tagName.toLowerCase())&&!E(m)?H=S():H=p(m,g,x,k,I,w);else if(P&6){g.slotScopeIds=I;const J=s(m);if(C?H=v(m):pu(m)&&m.data==="teleport start"?H=v(m,m.data,"teleport end"):H=i(m),t(g,J,null,x,k,lr(J),w),nu(g)&&!g.type.__asyncResolved){let j;C?(j=fe(Ee),j.anchor=H?H.previousSibling:J.lastChild):j=m.nodeType===3?oo(""):fe("div"),j.el=m,g.component.subTree=j}}else P&64?z!==8?H=S():H=g.type.hydrate(m,g,x,k,I,w,e,d):P&128?H=g.type.hydrate(m,g,x,k,lr(s(m)),I,w,e,f):__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Invalid HostVNode type:",R,`(${typeof R})`)}return T!=null&&On(T,null,k,g),H},p=(m,g,x,k,I,w)=>{w=w||!!g.dynamicChildren;const{type:C,props:S,patchFlag:R,shapeFlag:T,dirs:P,transition:q}=g,z=C==="input"||C==="option";if(z||R!==-1){P&&kt(g,null,x,"created");let H=!1;if(E(m)){H=Na(null,q)&&x&&x.vnode.props&&x.vnode.props.appear;const j=m.content.firstChild;H&&q.beforeEnter(j),A(j,m,x),g.el=m=j}if(T&16&&!(S&&(S.innerHTML||S.textContent))){let j=d(m.firstChild,g,m,x,k,I,w),be=!1;for(;j;){pn(m,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!be&&(zt("Hydration children mismatch on",m,`
-Server rendered element contains more child nodes than client vdom.`),be=!0),au());const je=j;j=j.nextSibling,o(je)}}else if(T&8){let j=g.children;j[0]===`
-`&&(m.tagName==="PRE"||m.tagName==="TEXTAREA")&&(j=j.slice(1)),m.textContent!==j&&(pn(m,0)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt("Hydration text content mismatch on",m,`
-  - rendered on server: ${m.textContent}
-  - expected on client: ${g.children}`),au()),m.textContent=g.children)}if(S){if(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__||z||!w||R&48){const j=m.tagName.includes("-");for(const be in S)__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!(P&&P.some(je=>je.dir.created))&&Fh(m,be,S[be],g,x)&&au(),(z&&(be.endsWith("value")||be==="indeterminate")||vu(be)&&!tu(be)||be[0]==="."||j)&&n(m,be,null,S[be],void 0,x)}else if(S.onClick)n(m,"onClick",null,S.onClick,void 0,x);else if(R&4&&uu(S.style))for(const j in S.style)S.style[j]}let J;(J=S&&S.onVnodeBeforeMount)&&Ke(J,x,g),P&&kt(g,null,x,"beforeMount"),((J=S&&S.onVnodeMounted)||P||H)&&Ua(()=>{J&&Ke(J,x,g),H&&q.enter(m),P&&kt(g,null,x,"mounted")},k)}return m.nextSibling},d=(m,g,x,k,I,w,C)=>{C=C||!!g.dynamicChildren;const S=g.children,R=S.length;let T=!1;for(let P=0;P<R;P++){const q=C?S[P]:S[P]=We(S[P]),z=q.type===Pt;m?(z&&!C&&P+1<R&&We(S[P+1]).type===Pt&&(c(r(m.data.slice(q.children.length)),x,i(m)),m.data=q.children),m=f(m,q,k,I,w,C)):z&&!q.children?c(q.el=r(""),x):(pn(x,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&!T&&(zt("Hydration children mismatch on",x,`
-Server rendered element contains fewer child nodes than client vdom.`),T=!0),au()),u(null,q,x,null,k,I,lr(x),w))}return m},h=(m,g,x,k,I,w)=>{const{slotScopeIds:C}=g;C&&(I=I?I.concat(C):C);const S=s(m),R=d(i(m),g,S,x,k,I,w);return R&&pu(R)&&R.data==="]"?i(g.anchor=R):(au(),c(g.anchor=l("]"),S,R),R)},_=(m,g,x,k,I,w)=>{if(pn(m.parentElement,1)||(__VUE_PROD_HYDRATION_MISMATCH_DETAILS__&&zt(`Hydration node mismatch:
-- rendered on server:`,m,m.nodeType===3?"(text)":pu(m)&&m.data==="["?"(start of fragment)":"",`
-- expected on client:`,g.type),au()),g.el=null,w){const R=v(m);for(;;){const T=i(m);if(T&&T!==R)o(T);else break}}const C=i(m),S=s(m);return o(m),u(null,g,S,C,x,k,lr(S),I),x&&(x.vnode.el=g.el,xi(x,g.el)),C},v=(m,g="[",x="]")=>{let k=0;for(;m;)if(m=i(m),m&&pu(m)&&(m.data===g&&k++,m.data===x)){if(k===0)return i(m);k--}return m},A=(m,g,x)=>{const k=g.parentNode;k&&k.replaceChild(m,g);let I=x;for(;I;)I.vnode.el===g&&(I.vnode.el=I.subTree.el=m),I=I.parent},E=m=>m.nodeType===1&&m.tagName==="TEMPLATE";return[a,f]}function Fh(e,t,u,n,r){let i,s,o,c;if(t==="class")o=e.getAttribute("class"),c=nn(u),Ih(nc(o||""),nc(c))||(i=2,s="class");else if(t==="style"){o=e.getAttribute("style")||"",c=Z(u)?u:ed(un(u));const l=rc(o),a=rc(c);if(n.dirs)for(const{dir:f,value:p}of n.dirs)f.name==="show"&&!p&&a.set("display","none");r&&sa(r,n,a),Oh(l,a)||(i=3,s="style")}else(e instanceof SVGElement&&dd(t)||e instanceof HTMLElement&&(Go(t)||fd(t)))&&(Go(t)?(o=e.hasAttribute(t),c=Bs(u)):u==null?(o=e.hasAttribute(t),c=!1):(e.hasAttribute(t)?o=e.getAttribute(t):t==="value"&&e.tagName==="TEXTAREA"?o=e.value:o=!1,c=hd(u)?String(u):!1),o!==c&&(i=4,s=t));if(i!=null&&!pn(e,i)){const l=p=>p===!1?"(not rendered)":`${s}="${p}"`,a=`Hydration ${oa[i]} mismatch on`,f=`
-  - rendered on server: ${l(o)}
-  - expected on client: ${l(c)}
-  Note: this mismatch is check-only. The DOM will not be rectified in production due to performance overhead.
-  You should fix the source of the mismatch.`;return zt(a,e,f),!0}return!1}function nc(e){return new Set(e.trim().split(/\s+/))}function Ih(e,t){if(e.size!==t.size)return!1;for(const u of e)if(!t.has(u))return!1;return!0}function rc(e){const t=new Map;for(const u of e.split(";")){let[n,r]=u.split(":");n=n.trim(),r=r&&r.trim(),n&&r&&t.set(n,r)}return t}function Oh(e,t){if(e.size!==t.size)return!1;for(const[u,n]of e)if(n!==t.get(u))return!1;return!0}function sa(e,t,u){const n=e.subTree;if(e.getCssVars&&(t===n||n&&n.type===Ee&&n.children.includes(t))){const r=e.getCssVars();for(const i in r)u.set(`--${md(i)}`,String(r[i]))}t===n&&e.parent&&sa(e.parent,e.vnode,u)}const ic="data-allow-mismatch",oa={0:"text",1:"children",2:"class",3:"style",4:"attribute"};function pn(e,t){if(t===0||t===1)for(;e&&!e.hasAttribute(ic);)e=e.parentElement;const u=e&&e.getAttribute(ic);if(u==null)return!1;if(u==="")return!0;{const n=u.split(",");return t===0&&n.includes("children")?!0:u.split(",").includes(oa[t])}}const Nh=gu().requestIdleCallback||(e=>setTimeout(e,1)),Rh=gu().cancelIdleCallback||(e=>clearTimeout(e)),Mh=(e=1e4)=>t=>{const u=Nh(t,{timeout:e});return()=>Rh(u)};function Ph(e){const{top:t,left:u,bottom:n,right:r}=e.getBoundingClientRect(),{innerHeight:i,innerWidth:s}=window;return(t>0&&t<i||n>0&&n<i)&&(u>0&&u<s||r>0&&r<s)}const Lh=e=>(t,u)=>{const n=new IntersectionObserver(r=>{for(const i of r)if(i.isIntersecting){n.disconnect(),t();break}},e);return u(r=>{if(r instanceof Element){if(Ph(r))return t(),n.disconnect(),!1;n.observe(r)}}),()=>n.disconnect()},Bh=e=>t=>{if(e){const u=matchMedia(e);if(u.matches)t();else return u.addEventListener("change",t,{once:!0}),()=>u.removeEventListener("change",t)}},Vh=(e=[])=>(t,u)=>{Z(e)&&(e=[e]);let n=!1;const r=s=>{n||(n=!0,i(),t(),s.target.dispatchEvent(new s.constructor(s.type,s)))},i=()=>{u(s=>{for(const o of e)s.removeEventListener(o,r)})};return u(s=>{for(const o of e)s.addEventListener(o,r,{once:!0})}),i};function 
$h(e,t){if(pu(e)&&e.data==="["){let u=1,n=e.nextSibling;for(;n;){if(n.nodeType===1){if(t(n)===!1)break}else if(pu(n))if(n.data==="]"){if(--u===0)break}else n.data==="["&&u++;n=n.nextSibling}}else t(e)}const nu=e=>!!e.type.__asyncLoader;/*! #__NO_SIDE_EFFECTS__ */function Hh(e){W(e)&&(e={loader:e});const{loader:t,loadingComponent:u,errorComponent:n,delay:r=200,hydrate:i,timeout:s,suspensible:o=!0,onError:c}=e;let l=null,a,f=0;const p=()=>(f++,l=null,d()),d=()=>{let h;return l||(h=l=t().catch(_=>{if(_=_ instanceof Error?_:new Error(String(_)),c)return new Promise((v,A)=>{c(_,()=>v(p()),()=>A(_),f+1)});throw _}).then(_=>h!==l&&l?l:(_&&(_.__esModule||_[Symbol.toStringTag]==="Module")&&(_=_.default),a=_,_)))};return Kn({name:"AsyncComponentWrapper",__asyncLoader:d,__asyncHydrate(h,_,v){const A=i?()=>{const E=i(v,m=>$h(h,m));E&&(_.bum||(_.bum=[])).push(E)}:v;a?A():d().then(()=>!_.isUnmounted&&A())},get __asyncResolved(){return a},setup(){const h=Se;if(Xs(h),a)return()=>Vi(a,h);const _=m=>{l=null,Fu(m,h,13,!n)};if(o&&h.suspense||Ku)return d().then(m=>()=>Vi(m,h)).catch(m=>(_(m),()=>n?fe(n,{error:m}):null));const v=yn(!1),A=yn(),E=yn(!!r);return r&&setTimeout(()=>{E.value=!1},r),s!=null&&setTimeout(()=>{if(!v.value&&!A.value){const m=new Error(`Async component timed out after ${s}ms.`);_(m),A.value=m}},s),d().then(()=>{v.value=!0,h.parent&&Wn(h.parent.vnode)&&h.parent.update()}).catch(m=>{_(m),A.value=m}),()=>{if(v.value&&a)return Vi(a,h);if(A.value&&n)return fe(n,{error:A.value});if(u&&!E.value)return fe(u)}}})}function Vi(e,t){const{ref:u,props:n,children:r,ce:i}=t.vnode,s=fe(e,n,r);return s.ref=u,s.ce=i,delete t.vnode.ce,s}const Wn=e=>e.type.__isKeepAlive,Uh={name:"KeepAlive",__isKeepAlive:!0,props:{include:[String,RegExp,Array],exclude:[String,RegExp,Array],max:[String,Number]},setup(e,{slots:t}){const u=ft(),n=u.ctx;if(!n.renderer)return()=>{const E=t.default&&t.default();return E&&E.length===1?E[0]:E};const r=new Map,i=new Set;let 
s=null;__VUE_PROD_DEVTOOLS__&&(u.__v_cache=r);const o=u.suspense,{renderer:{p:c,m:l,um:a,o:{createElement:f}}}=n,p=f("div");n.activate=(E,m,g,x,k)=>{const I=E.component;l(E,m,g,0,o),c(I.vnode,E,m,g,I,o,x,E.slotScopeIds,k),ke(()=>{I.isDeactivated=!1,I.a&&Vu(I.a);const w=E.props&&E.props.onVnodeMounted;w&&Ke(w,I.parent,E)},o),__VUE_PROD_DEVTOOLS__&&ss(I)},n.deactivate=E=>{const m=E.component;Or(m.m),Or(m.a),l(E,p,null,1,o),ke(()=>{m.da&&Vu(m.da);const g=E.props&&E.props.onVnodeUnmounted;g&&Ke(g,m.parent,E),m.isDeactivated=!0},o),__VUE_PROD_DEVTOOLS__&&ss(m)};function d(E){$i(E),a(E,u,o,!0)}function h(E){r.forEach((m,g)=>{const x=Lr(m.type);x&&!E(x)&&_(g)})}function _(E){const m=r.get(E);m&&(!s||!mt(m,s))?d(m):s&&$i(s),r.delete(E),i.delete(E)}Hu(()=>[e.include,e.exclude],([E,m])=>{E&&h(g=>mn(E,g)),m&&h(g=>!mn(m,g))},{flush:"post",deep:!0});let v=null;const A=()=>{v!=null&&(Nr(u.subTree.type)?ke(()=>{r.set(v,ar(u.subTree))},u.subTree.suspense):r.set(v,ar(u.subTree)))};return Gn(A),bi(A),gi(()=>{r.forEach(E=>{const{subTree:m,suspense:g}=u,x=ar(m);if(E.type===x.type&&E.key===x.key){$i(x);const k=x.component.da;k&&ke(k,g);return}d(E)})}),()=>{if(v=null,!t.default)return s=null;const E=t.default(),m=E[0];if(E.length>1)return s=null,E;if(!Vt(m)||!(m.shapeFlag&4)&&!(m.shapeFlag&128))return s=null,m;let g=ar(m);if(g.type===ye)return s=null,g;const x=g.type,k=Lr(nu(g)?g.type.__asyncResolved||{}:x),{include:I,exclude:w,max:C}=e;if(I&&(!k||!mn(I,k))||w&&k&&mn(w,k))return g.shapeFlag&=-257,s=g,m;const S=g.key==null?x:g.key,R=r.get(S);return g.el&&(g=vt(g),m.shapeFlag&128&&(m.ssContent=g)),v=S,R?(g.el=R.el,g.component=R.component,g.transition&&Bt(g,g.transition),g.shapeFlag|=512,i.delete(S),i.add(S)):(i.add(S),C&&i.size>parseInt(C,10)&&_(i.values().next().value)),g.shapeFlag|=256,s=g,Nr(m.type)?m:g}}},qh=Uh;function mn(e,t){return $(e)?e.some(u=>mn(u,t)):Z(e)?e.split(",").includes(t):qf(e)?(e.lastIndex=0,e.test(t)):!1}function ca(e,t){aa(e,"a",t)}function 
la(e,t){aa(e,"da",t)}function aa(e,t,u=Se){const n=e.__wdc||(e.__wdc=()=>{let r=u;for(;r;){if(r.isDeactivated)return;r=r.parent}return e()});if(mi(t,n,u),u){let r=u.parent;for(;r&&r.parent;)Wn(r.parent.vnode)&&jh(n,t,u,r),r=r.parent}}function jh(e,t,u,n){const r=mi(t,e,n,!0);_i(()=>{Ms(n[t],r)},u)}function $i(e){e.shapeFlag&=-257,e.shapeFlag&=-513}function ar(e){return e.shapeFlag&128?e.ssContent:e}function mi(e,t,u=Se,n=!1){if(u){const r=u[e]||(u[e]=[]),i=t.__weh||(t.__weh=(...s)=>{$t();const o=Au(u),c=lt(t,u,e,s);return o(),Ht(),c});return n?r.unshift(i):r.push(i),i}}const Ut=e=>(t,u=Se)=>{(!Ku||e==="sp")&&mi(e,(...n)=>t(...n),u)},fa=Ut("bm"),Gn=Ut("m"),Qs=Ut("bu"),bi=Ut("u"),gi=Ut("bum"),_i=Ut("um"),da=Ut("sp"),ha=Ut("rtg"),pa=Ut("rtc");function ma(e,t=Se){mi("ec",e,t)}const eo="components",zh="directives";function Kh(e,t){return to(eo,e,!0,t)||e}const ba=Symbol.for("v-ndc");function Wh(e){return Z(e)?to(eo,e,!1)||e:e||ba}function Gh(e){return to(zh,e)}function to(e,t,u=!0,n=!1){const r=ve||Se;if(r){const i=r.type;if(e===eo){const o=Lr(i,!1);if(o&&(o===t||o===de(t)||o===Tu(de(t))))return i}const s=sc(r[e]||i[e],t)||sc(r.appContext[e],t);return!s&&n?i:s}}function sc(e,t){return e&&(e[t]||e[de(t)]||e[Tu(de(t))])}function Zh(e,t,u,n){let r;const i=u&&u[n],s=$(e);if(s||Z(e)){const o=s&&uu(e);let c=!1;o&&(c=!ut(e),e=ii(e)),r=new Array(e.length);for(let l=0,a=e.length;l<a;l++)r[l]=t(c?Me(e[l]):e[l],l,void 0,i&&i[l])}else if(typeof e=="number"){r=new Array(e);for(let o=0;o<e;o++)r[o]=t(o+1,o,void 0,i&&i[o])}else if(oe(e))if(e[Symbol.iterator])r=Array.from(e,(o,c)=>t(o,c,void 0,i&&i[c]));else{const o=Object.keys(e);r=new Array(o.length);for(let c=0,l=o.length;c<l;c++){const a=o[c];r[c]=t(e[a],a,c,i&&i[c])}}else r=[];return u&&(u[n]=r),r}function Jh(e,t){for(let u=0;u<t.length;u++){const n=t[u];if($(n))for(let r=0;r<n.length;r++)e[n[r].name]=n[r].fn;else n&&(e[n.name]=n.key?(...r)=>{const i=n.fn(...r);return i&&(i.key=n.key),i}:n.fn)}return e}function 
Yh(e,t,u={},n,r){if(ve.ce||ve.parent&&nu(ve.parent)&&ve.parent.ce)return t!=="default"&&(u.name=t),Mn(),Rr(Ee,null,[fe("slot",u,n&&n())],64);let i=e[t];i&&i._c&&(i._d=!1),Mn();const s=i&&uo(i(u)),o=u.key||s&&s.key,c=Rr(Ee,{key:(o&&!Ze(o)?o:`_${t}`)+(!s&&n?"_fb":"")},s||(n?n():[]),s&&e._===1?64:-2);return!r&&c.scopeId&&(c.slotScopeIds=[c.scopeId+"-s"]),i&&i._c&&(i._d=!0),c}function uo(e){return e.some(t=>Vt(t)?!(t.type===ye||t.type===Ee&&!uo(t.children)):!0)?e:null}function Xh(e,t){const u={};for(const n in e)u[t&&/[A-Z]/.test(n)?`on:${n}`:Bu(n)]=e[n];return u}const cs=e=>e?Za(e)?Jn(e):cs(e.parent):null,En=te(Object.create(null),{$:e=>e,$el:e=>e.vnode.el,$data:e=>e.data,$props:e=>e.props,$attrs:e=>e.attrs,$slots:e=>e.slots,$refs:e=>e.refs,$parent:e=>cs(e.parent),$root:e=>cs(e.root),$host:e=>e.ce,$emit:e=>e.emit,$options:e=>__VUE_OPTIONS_API__?no(e):e.type,$forceUpdate:e=>e.f||(e.f=()=>{Ks(e.update)}),$nextTick:e=>e.n||(e.n=fi.bind(e.proxy)),$watch:e=>__VUE_OPTIONS_API__?I1.bind(e):we}),Hi=(e,t)=>e!==ee&&!e.__isScriptSetup&&re(e,t),ls={get({_:e},t){if(t==="__v_skip")return!0;const{ctx:u,setupState:n,data:r,props:i,accessCache:s,type:o,appContext:c}=e;let l;if(t[0]!=="$"){const d=s[t];if(d!==void 0)switch(d){case 1:return n[t];case 2:return r[t];case 4:return u[t];case 3:return i[t]}else{if(Hi(n,t))return s[t]=1,n[t];if(r!==ee&&re(r,t))return s[t]=2,r[t];if((l=e.propsOptions[0])&&re(l,t))return s[t]=3,i[t];if(u!==ee&&re(u,t))return s[t]=4,u[t];(!__VUE_OPTIONS_API__||as)&&(s[t]=0)}}const a=En[t];let f,p;if(a)return t==="$attrs"&&Re(e.attrs,"get",""),a(e);if((f=o.__cssModules)&&(f=f[t]))return f;if(u!==ee&&re(u,t))return s[t]=4,u[t];if(p=c.config.globalProperties,re(p,t))return p[t]},set({_:e},t,u){const{data:n,setupState:r,ctx:i}=e;return Hi(r,t)?(r[t]=u,!0):n!==ee&&re(n,t)?(n[t]=u,!0):re(e.props,t)||t[0]==="$"&&t.slice(1)in e?!1:(i[t]=u,!0)},has({_:{data:e,setupState:t,accessCache:u,ctx:n,appContext:r,propsOptions:i}},s){let 
o;return!!u[s]||e!==ee&&re(e,s)||Hi(t,s)||(o=i[0])&&re(o,s)||re(n,s)||re(En,s)||re(r.config.globalProperties,s)},defineProperty(e,t,u){return u.get!=null?e._.accessCache[t]=0:re(u,"value")&&this.set(e,t,u.value,null),Reflect.defineProperty(e,t,u)}},Qh=te({},ls,{get(e,t){if(t!==Symbol.unscopables)return ls.get(e,t,e)},has(e,t){return t[0]!=="_"&&!Jf(t)}});function e1(){return null}function t1(){return null}function u1(e){}function n1(e){}function r1(){return null}function i1(){}function s1(e,t){return null}function o1(){return ga().slots}function c1(){return ga().attrs}function ga(){const e=ft();return e.setupContext||(e.setupContext=Qa(e))}function Nn(e){return $(e)?e.reduce((t,u)=>(t[u]=null,t),{}):e}function l1(e,t){const u=Nn(e);for(const n in t){if(n.startsWith("__skip"))continue;let r=u[n];r?$(r)||W(r)?r=u[n]={type:r,default:t[n]}:r.default=t[n]:r===null&&(r=u[n]={default:t[n]}),r&&t[`__skip_${n}`]&&(r.skipFactory=!0)}return u}function a1(e,t){return!e||!t?e||t:$(e)&&$(t)?e.concat(t):te({},Nn(e),Nn(t))}function f1(e,t){const u={};for(const n in e)t.includes(n)||Object.defineProperty(u,n,{enumerable:!0,get:()=>e[n]});return u}function d1(e){const t=ft();let u=e();return bs(),Ps(u)&&(u=u.catch(n=>{throw Au(t),n})),[u,()=>Au(t)]}let as=!0;function h1(e){const t=no(e),u=e.proxy,n=e.ctx;as=!1,t.beforeCreate&&oc(t.beforeCreate,e,"bc");const{data:r,computed:i,methods:s,watch:o,provide:c,inject:l,created:a,beforeMount:f,mounted:p,beforeUpdate:d,updated:h,activated:_,deactivated:v,beforeDestroy:A,beforeUnmount:E,destroyed:m,unmounted:g,render:x,renderTracked:k,renderTriggered:I,errorCaptured:w,serverPrefetch:C,expose:S,inheritAttrs:R,components:T,directives:P,filters:q}=t;if(l&&p1(l,n,null),s)for(const J in s){const j=s[J];W(j)&&(n[J]=j.bind(u))}if(r){const J=r.call(u,u);oe(J)&&(e.data=oi(J))}if(as=!0,i)for(const J in i){const 
j=i[J],be=W(j)?j.bind(u,u):W(j.get)?j.get.bind(u,u):we,je=!W(j)&&W(j.set)?j.set.bind(u):we,dt=lo({get:be,set:je});Object.defineProperty(n,J,{enumerable:!0,configurable:!0,get:()=>dt.value,set:yt=>dt.value=yt})}if(o)for(const J in o)_a(o[J],n,u,J);if(c){const J=W(c)?c.call(u):c;Reflect.ownKeys(J).forEach(j=>{xa(j,J[j])})}a&&oc(a,e,"c");function H(J,j){$(j)?j.forEach(be=>J(be.bind(u))):j&&J(j.bind(u))}if(H(fa,f),H(Gn,p),H(Qs,d),H(bi,h),H(ca,_),H(la,v),H(ma,w),H(pa,k),H(ha,I),H(gi,E),H(_i,g),H(da,C),$(S))if(S.length){const J=e.exposed||(e.exposed={});S.forEach(j=>{Object.defineProperty(J,j,{get:()=>u[j],set:be=>u[j]=be})})}else e.exposed||(e.exposed={});x&&e.render===we&&(e.render=x),R!=null&&(e.inheritAttrs=R),T&&(e.components=T),P&&(e.directives=P),C&&Xs(e)}function p1(e,t,u=we){$(e)&&(e=fs(e));for(const n in e){const r=e[n];let i;oe(r)?"default"in r?i=Cn(r.from||n,r.default,!0):i=Cn(r.from||n):i=Cn(r),Ce(i)?Object.defineProperty(t,n,{enumerable:!0,configurable:!0,get:()=>i.value,set:s=>i.value=s}):t[n]=i}}function oc(e,t,u){lt($(e)?e.map(n=>n.bind(t.proxy)):e.bind(t.proxy),t,u)}function _a(e,t,u,n){let r=n.includes(".")?Ba(u,n):()=>u[n];if(Z(e)){const i=t[e];W(i)&&Hu(r,i)}else if(W(e))Hu(r,e.bind(u));else if(oe(e))if($(e))e.forEach(i=>_a(i,t,u,n));else{const i=W(e.handler)?e.handler.bind(u):t[e.handler];W(i)&&Hu(r,i,e)}}function no(e){const t=e.type,{mixins:u,extends:n}=t,{mixins:r,optionsCache:i,config:{optionMergeStrategies:s}}=e.appContext,o=i.get(t);let c;return o?c=o:!r.length&&!u&&!n?c=t:(c={},r.length&&r.forEach(l=>Ir(c,l,s,!0)),Ir(c,t,s)),oe(t)&&i.set(t,c),c}function Ir(e,t,u,n=!1){const{mixins:r,extends:i}=t;i&&Ir(e,i,u,!0),r&&r.forEach(s=>Ir(e,s,u,!0));for(const s in t)if(!(n&&s==="expose")){const o=m1[s]||u&&u[s];e[s]=o?o(e[s],t[s]):t[s]}return e}const 
m1={data:cc,props:lc,emits:lc,methods:bn,computed:bn,beforeCreate:Ve,created:Ve,beforeMount:Ve,mounted:Ve,beforeUpdate:Ve,updated:Ve,beforeDestroy:Ve,beforeUnmount:Ve,destroyed:Ve,unmounted:Ve,activated:Ve,deactivated:Ve,errorCaptured:Ve,serverPrefetch:Ve,components:bn,directives:bn,watch:g1,provide:cc,inject:b1};function cc(e,t){return t?e?function(){return te(W(e)?e.call(this,this):e,W(t)?t.call(this,this):t)}:t:e}function b1(e,t){return bn(fs(e),fs(t))}function fs(e){if($(e)){const t={};for(let u=0;u<e.length;u++)t[e[u]]=e[u];return t}return e}function Ve(e,t){return e?[...new Set([].concat(e,t))]:t}function bn(e,t){return e?te(Object.create(null),e,t):t}function lc(e,t){return e?$(e)&&$(t)?[...new Set([...e,...t])]:te(Object.create(null),Nn(e),Nn(t??{})):t}function g1(e,t){if(!e)return t;if(!t)return e;const u=te(Object.create(null),e);for(const n in t)u[n]=Ve(e[n],t[n]);return u}function ya(){return{app:null,config:{isNativeTag:dn,performance:!1,globalProperties:{},optionMergeStrategies:{},errorHandler:void 0,warnHandler:void 0,compilerOptions:{}},mixins:[],components:{},directives:{},provides:Object.create(null),optionsCache:new WeakMap,propsCache:new WeakMap,emitsCache:new WeakMap}}let _1=0;function y1(e,t){return function(n,r=null){W(n)||(n=te({},n)),r!=null&&!oe(r)&&(r=null);const i=ya(),s=new WeakSet,o=[];let c=!1;const l=i.app={_uid:_1++,_component:n,_props:r,_container:null,_context:i,_instance:null,version:ys,get config(){return i.config},set config(a){},use(a,...f){return s.has(a)||(a&&W(a.install)?(s.add(a),a.install(l,...f)):W(a)&&(s.add(a),a(l,...f))),l},mixin(a){return __VUE_OPTIONS_API__&&(i.mixins.includes(a)||i.mixins.push(a)),l},component(a,f){return f?(i.components[a]=f,l):i.components[a]},directive(a,f){return f?(i.directives[a]=f,l):i.directives[a]},mount(a,f,p){if(!c){const d=l._ceVNode||fe(n,r);return d.appContext=i,p===!0?p="svg":p===!1&&(p=void 
0),f&&t?t(d,a):e(d,a,p),c=!0,l._container=a,a.__vue_app__=l,__VUE_PROD_DEVTOOLS__&&(l._instance=d.component,hh(l,ys)),Jn(d.component)}},onUnmount(a){o.push(a)},unmount(){c&&(lt(o,l._instance,16),e(null,l._container),__VUE_PROD_DEVTOOLS__&&(l._instance=null,ph(l)),delete l._container.__vue_app__)},provide(a,f){return i.provides[a]=f,l},runWithContext(a){const f=xu;xu=l;try{return a()}finally{xu=f}}};return l}}let xu=null;function xa(e,t){if(Se){let u=Se.provides;const n=Se.parent&&Se.parent.provides;n===u&&(u=Se.provides=Object.create(n)),u[e]=t}}function Cn(e,t,u=!1){const n=Se||ve;if(n||xu){const r=xu?xu._context.provides:n?n.parent==null?n.vnode.appContext&&n.vnode.appContext.provides:n.parent.provides:void 0;if(r&&e in r)return r[e];if(arguments.length>1)return u&&W(t)?t.call(n&&n.proxy):t}}function x1(){return!!(Se||ve||xu)}const Ea={},Ca=()=>Object.create(Ea),ka=e=>Object.getPrototypeOf(e)===Ea;function E1(e,t,u,n=!1){const r={},i=Ca();e.propsDefaults=Object.create(null),Aa(e,t,r,i);for(const s in e.propsOptions[0])s in r||(r[s]=void 0);u?e.props=n?r:Hl(r):e.type.props?e.props=r:e.props=i,e.attrs=i}function C1(e,t,u,n){const{props:r,attrs:i,vnode:{patchFlag:s}}=e,o=Q(r),[c]=e.propsOptions;let l=!1;if((n||s>0)&&!(s&16)){if(s&8){const a=e.vnode.dynamicProps;for(let f=0;f<a.length;f++){let p=a[f];if(yi(e.emitsOptions,p))continue;const d=t[p];if(c)if(re(i,p))d!==i[p]&&(i[p]=d,l=!0);else{const h=de(p);r[h]=ds(c,o,h,d,e,!1)}else d!==i[p]&&(i[p]=d,l=!0)}}}else{Aa(e,t,r,i)&&(l=!0);let a;for(const f in o)(!t||!re(t,f)&&((a=qe(f))===f||!re(t,a)))&&(c?u&&(u[f]!==void 0||u[a]!==void 0)&&(r[f]=ds(c,o,f,void 0,e,!0)):delete r[f]);if(i!==o)for(const f in i)(!t||!re(t,f))&&(delete i[f],l=!0)}l&&Nt(e.attrs,"set","")}function Aa(e,t,u,n){const[r,i]=e.propsOptions;let s=!1,o;if(t)for(let c in t){if(tu(c))continue;const l=t[c];let a;r&&re(r,a=de(c))?!i||!i.includes(a)?u[a]=l:(o||(o={}))[a]=l:yi(e.emitsOptions,c)||(!(c in n)||l!==n[c])&&(n[c]=l,s=!0)}if(i){const 
c=Q(u),l=o||ee;for(let a=0;a<i.length;a++){const f=i[a];u[f]=ds(r,c,f,l[f],e,!re(l,f))}}return s}function ds(e,t,u,n,r,i){const s=e[u];if(s!=null){const o=re(s,"default");if(o&&n===void 0){const c=s.default;if(s.type!==Function&&!s.skipFactory&&W(c)){const{propsDefaults:l}=r;if(u in l)n=l[u];else{const a=Au(r);n=l[u]=c.call(null,t),a()}}else n=c;r.ce&&r.ce._setProp(u,n)}s[0]&&(i&&!o?n=!1:s[1]&&(n===""||n===qe(u))&&(n=!0))}return n}const k1=new WeakMap;function Sa(e,t,u=!1){const n=__VUE_OPTIONS_API__&&u?k1:t.propsCache,r=n.get(e);if(r)return r;const i=e.props,s={},o=[];let c=!1;if(__VUE_OPTIONS_API__&&!W(e)){const a=f=>{c=!0;const[p,d]=Sa(f,t,!0);te(s,p),d&&o.push(...d)};!u&&t.mixins.length&&t.mixins.forEach(a),e.extends&&a(e.extends),e.mixins&&e.mixins.forEach(a)}if(!i&&!c)return oe(e)&&n.set(e,Pu),Pu;if($(i))for(let a=0;a<i.length;a++){const f=de(i[a]);ac(f)&&(s[f]=ee)}else if(i)for(const a in i){const f=de(a);if(ac(f)){const p=i[a],d=s[f]=$(p)||W(p)?{type:p}:te({},p),h=d.type;let _=!1,v=!0;if($(h))for(let A=0;A<h.length;++A){const E=h[A],m=W(E)&&E.name;if(m==="Boolean"){_=!0;break}else m==="String"&&(v=!1)}else _=W(h)&&h.name==="Boolean";d[0]=_,d[1]=v,(_||re(d,"default"))&&o.push(f)}}const l=[s,o];return oe(e)&&n.set(e,l),l}function ac(e){return e[0]!=="$"&&!tu(e)}const va=e=>e[0]==="_"||e==="$stable",ro=e=>$(e)?e.map(We):[We(e)],A1=(e,t,u)=>{if(t._n)return t;const n=Zs((...r)=>ro(t(...r)),u);return n._c=!1,n},Da=(e,t,u)=>{const n=e._ctx;for(const r in e){if(va(r))continue;const i=e[r];if(W(i))t[r]=A1(r,i,n);else if(i!=null){const s=ro(i);t[r]=()=>s}}},Ta=(e,t)=>{const u=ro(t);e.slots.default=()=>u},wa=(e,t,u)=>{for(const n in t)(u||n!=="_")&&(e[n]=t[n])},S1=(e,t,u)=>{const n=e.slots=Ca();if(e.vnode.shapeFlag&32){const r=t._;r?(wa(n,t,u),u&&ju(n,"_",r,!0)):Da(t,n)}else t&&Ta(e,t)},v1=(e,t,u)=>{const{vnode:n,slots:r}=e;let i=!0,s=ee;if(n.shapeFlag&32){const o=t._;o?u&&o===1?i=!1:wa(r,t,u):(i=!t.$stable,Da(t,r)),s=t}else t&&(Ta(e,t),s={default:1});if(i)for(const o 
in r)!va(o)&&s[o]==null&&delete r[o]};function D1(){typeof __VUE_OPTIONS_API__!="boolean"&&(gu().__VUE_OPTIONS_API__=!0),typeof __VUE_PROD_DEVTOOLS__!="boolean"&&(gu().__VUE_PROD_DEVTOOLS__=!1),typeof __VUE_PROD_HYDRATION_MISMATCH_DETAILS__!="boolean"&&(gu().__VUE_PROD_HYDRATION_MISMATCH_DETAILS__=!1)}const ke=Ua;function Fa(e){return Oa(e)}function Ia(e){return Oa(e,wh)}function Oa(e,t){D1();const u=gu();u.__VUE__=!0,__VUE_PROD_DEVTOOLS__&&Ws(u.__VUE_DEVTOOLS_GLOBAL_HOOK__,u);const{insert:n,remove:r,patchProp:i,createElement:s,createText:o,createComment:c,setText:l,setElementText:a,parentNode:f,nextSibling:p,setScopeId:d=we,insertStaticContent:h}=e,_=(b,y,D,N=null,F=null,O=null,V=void 0,B=null,L=!!y.dynamicChildren)=>{if(b===y)return;b&&!mt(b,y)&&(N=nr(b),yt(b,F,O,!0),b=null),y.patchFlag===-2&&(L=!1,y.dynamicChildren=null);const{type:M,ref:G,shapeFlag:U}=y;switch(M){case Pt:v(b,y,D,N);break;case ye:A(b,y,D,N);break;case ru:b==null&&E(y,D,N,V);break;case Ee:T(b,y,D,N,F,O,V,B,L);break;default:U&1?x(b,y,D,N,F,O,V,B,L):U&6?P(b,y,D,N,F,O,V,B,L):(U&64||U&128)&&M.process(b,y,D,N,F,O,V,B,L,Iu)}G!=null&&F&&On(G,b&&b.ref,O,y||b,!y)},v=(b,y,D,N)=>{if(b==null)n(y.el=o(y.children),D,N);else{const F=y.el=b.el;y.children!==b.children&&l(F,y.children)}},A=(b,y,D,N)=>{b==null?n(y.el=c(y.children||""),D,N):y.el=b.el},E=(b,y,D,N)=>{[b.el,b.anchor]=h(b.children,y,D,N,b.el,b.anchor)},m=({el:b,anchor:y},D,N)=>{let F;for(;b&&b!==y;)F=p(b),n(b,D,N),b=F;n(y,D,N)},g=({el:b,anchor:y})=>{let D;for(;b&&b!==y;)D=p(b),r(b),b=D;r(y)},x=(b,y,D,N,F,O,V,B,L)=>{y.type==="svg"?V="svg":y.type==="math"&&(V="mathml"),b==null?k(y,D,N,F,O,V,B,L):C(b,y,F,O,V,B,L)},k=(b,y,D,N,F,O,V,B)=>{let L,M;const{props:G,shapeFlag:U,transition:K,dirs:Y}=b;if(L=b.el=s(b.type,O,G&&G.is,G),U&8?a(L,b.children):U&16&&w(b.children,L,null,N,F,Ui(b,O),V,B),Y&&kt(b,null,N,"created"),I(L,b,b.scopeId,V,N),G){for(const ce in G)ce!=="value"&&!tu(ce)&&i(L,ce,null,G[ce],O,N);"value"in 
G&&i(L,"value",null,G.value,O),(M=G.onVnodeBeforeMount)&&Ke(M,N,b)}__VUE_PROD_DEVTOOLS__&&(ju(L,"__vnode",b,!0),ju(L,"__vueParentComponent",N,!0)),Y&&kt(b,null,N,"beforeMount");const ue=Na(F,K);ue&&K.beforeEnter(L),n(L,y,D),((M=G&&G.onVnodeMounted)||ue||Y)&&ke(()=>{M&&Ke(M,N,b),ue&&K.enter(L),Y&&kt(b,null,N,"mounted")},F)},I=(b,y,D,N,F)=>{if(D&&d(b,D),N)for(let O=0;O<N.length;O++)d(b,N[O]);if(F){let O=F.subTree;if(y===O||Nr(O.type)&&(O.ssContent===y||O.ssFallback===y)){const V=F.vnode;I(b,V,V.scopeId,V.slotScopeIds,F.parent)}}},w=(b,y,D,N,F,O,V,B,L=0)=>{for(let M=L;M<b.length;M++){const G=b[M]=B?Jt(b[M]):We(b[M]);_(null,G,y,D,N,F,O,V,B)}},C=(b,y,D,N,F,O,V)=>{const B=y.el=b.el;__VUE_PROD_DEVTOOLS__&&(B.__vnode=y);let{patchFlag:L,dynamicChildren:M,dirs:G}=y;L|=b.patchFlag&16;const U=b.props||ee,K=y.props||ee;let Y;if(D&&fu(D,!1),(Y=K.onVnodeBeforeUpdate)&&Ke(Y,D,y,b),G&&kt(y,b,D,"beforeUpdate"),D&&fu(D,!0),(U.innerHTML&&K.innerHTML==null||U.textContent&&K.textContent==null)&&a(B,""),M?S(b.dynamicChildren,M,B,D,N,Ui(y,F),O):V||j(b,y,B,null,D,N,Ui(y,F),O,!1),L>0){if(L&16)R(B,U,K,D,F);else if(L&2&&U.class!==K.class&&i(B,"class",null,K.class,F),L&4&&i(B,"style",U.style,K.style,F),L&8){const ue=y.dynamicProps;for(let ce=0;ce<ue.length;ce++){const se=ue[ce],Ye=U[se],Ie=K[se];(Ie!==Ye||se==="value")&&i(B,se,Ye,Ie,F,D)}}L&1&&b.children!==y.children&&a(B,y.children)}else!V&&M==null&&R(B,U,K,D,F);((Y=K.onVnodeUpdated)||G)&&ke(()=>{Y&&Ke(Y,D,y,b),G&&kt(y,b,D,"updated")},N)},S=(b,y,D,N,F,O,V)=>{for(let B=0;B<y.length;B++){const L=b[B],M=y[B],G=L.el&&(L.type===Ee||!mt(L,M)||L.shapeFlag&70)?f(L.el):D;_(L,M,G,null,N,F,O,V,!0)}},R=(b,y,D,N,F)=>{if(y!==D){if(y!==ee)for(const O in y)!tu(O)&&!(O in D)&&i(b,O,y[O],null,F,N);for(const O in D){if(tu(O))continue;const V=D[O],B=y[O];V!==B&&O!=="value"&&i(b,O,B,V,F,N)}"value"in D&&i(b,"value",y.value,D.value,F)}},T=(b,y,D,N,F,O,V,B,L)=>{const 
M=y.el=b?b.el:o(""),G=y.anchor=b?b.anchor:o("");let{patchFlag:U,dynamicChildren:K,slotScopeIds:Y}=y;Y&&(B=B?B.concat(Y):Y),b==null?(n(M,D,N),n(G,D,N),w(y.children||[],D,G,F,O,V,B,L)):U>0&&U&64&&K&&b.dynamicChildren?(S(b.dynamicChildren,K,D,F,O,V,B),(y.key!=null||F&&y===F.subTree)&&io(b,y,!0)):j(b,y,D,G,F,O,V,B,L)},P=(b,y,D,N,F,O,V,B,L)=>{y.slotScopeIds=B,b==null?y.shapeFlag&512?F.ctx.activate(y,D,N,V,L):q(y,D,N,F,O,V,L):z(b,y,L)},q=(b,y,D,N,F,O,V)=>{const B=b.component=Ga(b,N,F);if(Wn(b)&&(B.ctx.renderer=Iu),Ja(B,!1,V),B.asyncDep){if(F&&F.registerDep(B,H,V),!b.el){const L=B.subTree=fe(ye);A(null,L,y,D)}}else H(B,b,y,D,F,O,V)},z=(b,y,D)=>{const N=y.component=b.component;if(L1(b,y,D))if(N.asyncDep&&!N.asyncResolved){J(N,y,D);return}else N.next=y,N.update();else y.el=b.el,N.vnode=y},H=(b,y,D,N,F,O,V)=>{const B=()=>{if(b.isMounted){let{next:U,bu:K,u:Y,parent:ue,vnode:ce}=b;{const Xe=Ra(b);if(Xe){U&&(U.el=ce.el,J(b,U,V)),Xe.asyncDep.then(()=>{b.isUnmounted||B()});return}}let se=U,Ye;fu(b,!1),U?(U.el=ce.el,J(b,U,V)):U=ce,K&&Vu(K),(Ye=U.props&&U.props.onVnodeBeforeUpdate)&&Ke(Ye,ue,U,ce),fu(b,!0);const Ie=_r(b),ht=b.subTree;b.subTree=Ie,_(ht,Ie,f(ht.el),nr(ht),b,F,O),U.el=Ie.el,se===null&&xi(b,Ie.el),Y&&ke(Y,F),(Ye=U.props&&U.props.onVnodeUpdated)&&ke(()=>Ke(Ye,ue,U,ce),F),__VUE_PROD_DEVTOOLS__&&Yl(b)}else{let U;const{el:K,props:Y}=y,{bm:ue,m:ce,parent:se,root:Ye,type:Ie}=b,ht=nu(y);if(fu(b,!1),ue&&Vu(ue),!ht&&(U=Y&&Y.onVnodeBeforeMount)&&Ke(U,se,y),fu(b,!0),K&&Oi){const Xe=()=>{b.subTree=_r(b),Oi(K,b.subTree,b,F,null)};ht&&Ie.__asyncHydrate?Ie.__asyncHydrate(K,b,Xe):Xe()}else{Ye.ce&&Ye.ce._injectChildStyle(Ie);const Xe=b.subTree=_r(b);_(null,Xe,D,N,b,F,O),y.el=Xe.el}if(ce&&ke(ce,F),!ht&&(U=Y&&Y.onVnodeMounted)){const Xe=y;ke(()=>Ke(U,se,Xe),F)}(y.shapeFlag&256||se&&nu(se.vnode)&&se.vnode.shapeFlag&256)&&b.a&&ke(b.a,F),b.isMounted=!0,__VUE_PROD_DEVTOOLS__&&ss(b),y=D=N=null}};b.scope.on();const L=b.effect=new vn(B);b.scope.off();const 
M=b.update=L.run.bind(L),G=b.job=L.runIfDirty.bind(L);G.i=b,G.id=b.uid,L.scheduler=()=>Ks(G),fu(b,!0),M()},J=(b,y,D)=>{y.component=b;const N=b.vnode.props;b.vnode=y,b.next=null,C1(b,y.props,N,D),v1(b,y.children,D),$t(),Yo(b),Ht()},j=(b,y,D,N,F,O,V,B,L=!1)=>{const M=b&&b.children,G=b?b.shapeFlag:0,U=y.children,{patchFlag:K,shapeFlag:Y}=y;if(K>0){if(K&128){je(M,U,D,N,F,O,V,B,L);return}else if(K&256){be(M,U,D,N,F,O,V,B,L);return}}Y&8?(G&16&&sn(M,F,O),U!==M&&a(D,U)):G&16?Y&16?je(M,U,D,N,F,O,V,B,L):sn(M,F,O,!0):(G&8&&a(D,""),Y&16&&w(U,D,N,F,O,V,B,L))},be=(b,y,D,N,F,O,V,B,L)=>{b=b||Pu,y=y||Pu;const M=b.length,G=y.length,U=Math.min(M,G);let K;for(K=0;K<U;K++){const Y=y[K]=L?Jt(y[K]):We(y[K]);_(b[K],Y,D,null,F,O,V,B,L)}M>G?sn(b,F,O,!0,!1,U):w(y,D,N,F,O,V,B,L,U)},je=(b,y,D,N,F,O,V,B,L)=>{let M=0;const G=y.length;let U=b.length-1,K=G-1;for(;M<=U&&M<=K;){const Y=b[M],ue=y[M]=L?Jt(y[M]):We(y[M]);if(mt(Y,ue))_(Y,ue,D,null,F,O,V,B,L);else break;M++}for(;M<=U&&M<=K;){const Y=b[U],ue=y[K]=L?Jt(y[K]):We(y[K]);if(mt(Y,ue))_(Y,ue,D,null,F,O,V,B,L);else break;U--,K--}if(M>U){if(M<=K){const Y=K+1,ue=Y<G?y[Y].el:N;for(;M<=K;)_(null,y[M]=L?Jt(y[M]):We(y[M]),D,ue,F,O,V,B,L),M++}}else if(M>K)for(;M<=U;)yt(b[M],F,O,!0),M++;else{const Y=M,ue=M,ce=new Map;for(M=ue;M<=K;M++){const Qe=y[M]=L?Jt(y[M]):We(y[M]);Qe.key!=null&&ce.set(Qe.key,M)}let se,Ye=0;const Ie=K-ue+1;let ht=!1,Xe=0;const on=new Array(Ie);for(M=0;M<Ie;M++)on[M]=0;for(M=Y;M<=U;M++){const Qe=b[M];if(Ye>=Ie){yt(Qe,F,O,!0);continue}let xt;if(Qe.key!=null)xt=ce.get(Qe.key);else for(se=ue;se<=K;se++)if(on[se-ue]===0&&mt(Qe,y[se])){xt=se;break}xt===void 0?yt(Qe,F,O,!0):(on[xt-ue]=M+1,xt>=Xe?Xe=xt:ht=!0,_(Qe,y[xt],D,null,F,O,V,B,L),Ye++)}const jo=ht?T1(on):Pu;for(se=jo.length-1,M=Ie-1;M>=0;M--){const 
Qe=ue+M,xt=y[Qe],zo=Qe+1<G?y[Qe+1].el:N;on[M]===0?_(null,xt,D,zo,F,O,V,B,L):ht&&(se<0||M!==jo[se]?dt(xt,D,zo,2):se--)}}},dt=(b,y,D,N,F=null)=>{const{el:O,type:V,transition:B,children:L,shapeFlag:M}=b;if(M&6){dt(b.component.subTree,y,D,N);return}if(M&128){b.suspense.move(y,D,N);return}if(M&64){V.move(b,y,D,Iu);return}if(V===Ee){n(O,y,D);for(let U=0;U<L.length;U++)dt(L[U],y,D,N);n(b.anchor,y,D);return}if(V===ru){m(b,y,D);return}if(N!==2&&M&1&&B)if(N===0)B.beforeEnter(O),n(O,y,D),ke(()=>B.enter(O),F);else{const{leave:U,delayLeave:K,afterLeave:Y}=B,ue=()=>n(O,y,D),ce=()=>{U(O,()=>{ue(),Y&&Y()})};K?K(O,ue,ce):ce()}else n(O,y,D)},yt=(b,y,D,N=!1,F=!1)=>{const{type:O,props:V,ref:B,children:L,dynamicChildren:M,shapeFlag:G,patchFlag:U,dirs:K,cacheIndex:Y}=b;if(U===-2&&(F=!1),B!=null&&On(B,null,D,b,!0),Y!=null&&(y.renderCache[Y]=void 0),G&256){y.ctx.deactivate(b);return}const ue=G&1&&K,ce=!nu(b);let se;if(ce&&(se=V&&V.onVnodeBeforeUnmount)&&Ke(se,y,b),G&6)Hf(b.component,D,N);else{if(G&128){b.suspense.unmount(D,N);return}ue&&kt(b,null,y,"beforeUnmount"),G&64?b.type.remove(b,y,D,Iu,N):M&&!M.hasOnce&&(O!==Ee||U>0&&U&64)?sn(M,y,D,!1,!0):(O===Ee&&U&384||!F&&G&16)&&sn(L,y,D),N&&Uo(b)}(ce&&(se=V&&V.onVnodeUnmounted)||ue)&&ke(()=>{se&&Ke(se,y,b),ue&&kt(b,null,y,"unmounted")},D)},Uo=b=>{const{type:y,el:D,anchor:N,transition:F}=b;if(y===Ee){$f(D,N);return}if(y===ru){g(b);return}const O=()=>{r(D),F&&!F.persisted&&F.afterLeave&&F.afterLeave()};if(b.shapeFlag&1&&F&&!F.persisted){const{leave:V,delayLeave:B}=F,L=()=>V(D,O);B?B(b.el,O,L):L()}else O()},$f=(b,y)=>{let D;for(;b!==y;)D=p(b),r(b),b=D;r(y)},Hf=(b,y,D)=>{const{bum:N,scope:F,job:O,subTree:V,um:B,m:L,a:M}=b;Or(L),Or(M),N&&Vu(N),F.stop(),O&&(O.flags|=8,yt(V,b,y,D)),B&&ke(B,y),ke(()=>{b.isUnmounted=!0},y),y&&y.pendingBranch&&!y.isUnmounted&&b.asyncDep&&!b.asyncResolved&&b.suspenseId===y.pendingId&&(y.deps--,y.deps===0&&y.resolve()),__VUE_PROD_DEVTOOLS__&&bh(b)},sn=(b,y,D,N=!1,F=!1,O=0)=>{for(let 
V=O;V<b.length;V++)yt(b[V],y,D,N,F)},nr=b=>{if(b.shapeFlag&6)return nr(b.component.subTree);if(b.shapeFlag&128)return b.suspense.next();const y=p(b.anchor||b.el),D=y&&y[Xl];return D?p(D):y};let Fi=!1;const qo=(b,y,D)=>{b==null?y._vnode&&yt(y._vnode,null,null,!0):_(y._vnode||null,b,y,null,null,null,D),y._vnode=b,Fi||(Fi=!0,Yo(),Fr(),Fi=!1)},Iu={p:_,um:yt,m:dt,r:Uo,mt:q,mc:w,pc:j,pbc:S,n:nr,o:e};let Ii,Oi;return t&&([Ii,Oi]=t(Iu)),{render:qo,hydrate:Ii,createApp:y1(qo,Ii)}}function Ui({type:e,props:t},u){return u==="svg"&&e==="foreignObject"||u==="mathml"&&e==="annotation-xml"&&t&&t.encoding&&t.encoding.includes("html")?void 0:u}function fu({effect:e,job:t},u){u?(e.flags|=32,t.flags|=4):(e.flags&=-33,t.flags&=-5)}function Na(e,t){return(!e||e&&!e.pendingBranch)&&t&&!t.persisted}function io(e,t,u=!1){const n=e.children,r=t.children;if($(n)&&$(r))for(let i=0;i<n.length;i++){const s=n[i];let o=r[i];o.shapeFlag&1&&!o.dynamicChildren&&((o.patchFlag<=0||o.patchFlag===32)&&(o=r[i]=Jt(r[i]),o.el=s.el),!u&&o.patchFlag!==-2&&io(s,o)),o.type===Pt&&(o.el=s.el)}}function T1(e){const t=e.slice(),u=[0];let n,r,i,s,o;const c=e.length;for(n=0;n<c;n++){const l=e[n];if(l!==0){if(r=u[u.length-1],e[r]<l){t[n]=r,u.push(n);continue}for(i=0,s=u.length-1;i<s;)o=i+s>>1,e[u[o]]<l?i=o+1:s=o;l<e[u[i]]&&(i>0&&(t[n]=u[i-1]),u[i]=n)}}for(i=u.length,s=u[i-1];i-- >0;)u[i]=s,s=t[s];return u}function Ra(e){const t=e.subTree.component;if(t)return t.asyncDep&&!t.asyncResolved?t:Ra(t)}function Or(e){if(e)for(let t=0;t<e.length;t++)e[t].flags|=8}const Ma=Symbol.for("v-scx"),Pa=()=>Cn(Ma);function w1(e,t){return Zn(e,null,t)}function F1(e,t){return Zn(e,null,{flush:"post"})}function La(e,t){return Zn(e,null,{flush:"sync"})}function Hu(e,t,u){return Zn(e,t,u)}function Zn(e,t,u=ee){const{immediate:n,deep:r,flush:i,once:s}=u,o=te({},u),c=t&&n||!t&&i!=="post";let l;if(Ku){if(i==="sync"){const d=Pa();l=d.__watcherHandles||(d.__watcherHandles=[])}else if(!c){const d=()=>{};return 
d.stop=we,d.resume=we,d.pause=we,d}}const a=Se;o.call=(d,h,_)=>lt(d,a,h,_);let f=!1;i==="post"?o.scheduler=d=>{ke(d,a&&a.suspense)}:i!=="sync"&&(f=!0,o.scheduler=(d,h)=>{h?d():Ks(d)}),o.augmentJob=d=>{t&&(d.flags|=4),f&&(d.flags|=2,a&&(d.id=a.uid,d.i=a))};const p=th(e,t,o);return Ku&&(l?l.push(p):c&&p()),p}function I1(e,t,u){const n=this.proxy,r=Z(e)?e.includes(".")?Ba(n,e):()=>n[e]:e.bind(n,n);let i;W(t)?i=t:(i=t.handler,u=t);const s=Au(this),o=Zn(r,i.bind(n),u);return s(),o}function Ba(e,t){const u=t.split(".");return()=>{let n=e;for(let r=0;r<u.length&&n;r++)n=n[u[r]];return n}}function O1(e,t,u=ee){const n=ft(),r=de(t),i=qe(t),s=Va(e,r),o=jl((c,l)=>{let a,f=ee,p;return La(()=>{const d=e[r];He(a,d)&&(a=d,l())}),{get(){return c(),u.get?u.get(a):a},set(d){const h=u.set?u.set(d):d;if(!He(h,a)&&!(f!==ee&&He(d,f)))return;const _=n.vnode.props;_&&(t in _||r in _||i in _)&&(`onUpdate:${t}`in _||`onUpdate:${r}`in _||`onUpdate:${i}`in _)||(a=d,l()),n.emit(`update:${t}`,h),He(d,h)&&He(d,f)&&!He(h,p)&&l(),f=d,p=h}}});return o[Symbol.iterator]=()=>{let c=0;return{next(){return c<2?{value:c++?s||ee:o,done:!1}:{done:!0}}}},o}const Va=(e,t)=>t==="modelValue"||t==="model-value"?e.modelModifiers:e[`${t}Modifiers`]||e[`${de(t)}Modifiers`]||e[`${qe(t)}Modifiers`];function N1(e,t,...u){if(e.isUnmounted)return;const n=e.vnode.props||ee;let r=u;const i=t.startsWith("update:"),s=i&&Va(n,t.slice(7));s&&(s.trim&&(r=u.map(a=>Z(a)?a.trim():a)),s.number&&(r=u.map(Sr))),__VUE_PROD_DEVTOOLS__&&gh(e,t,r);let o,c=n[o=Bu(t)]||n[o=Bu(de(t))];!c&&i&&(c=n[o=Bu(qe(t))]),c&&lt(c,e,6,r);const l=n[o+"Once"];if(l){if(!e.emitted)e.emitted={};else if(e.emitted[o])return;e.emitted[o]=!0,lt(l,e,6,r)}}function $a(e,t,u=!1){const n=t.emitsCache,r=n.get(e);if(r!==void 0)return r;const i=e.emits;let s={},o=!1;if(__VUE_OPTIONS_API__&&!W(e)){const c=l=>{const 
a=$a(l,t,!0);a&&(o=!0,te(s,a))};!u&&t.mixins.length&&t.mixins.forEach(c),e.extends&&c(e.extends),e.mixins&&e.mixins.forEach(c)}return!i&&!o?(oe(e)&&n.set(e,null),null):($(i)?i.forEach(c=>s[c]=null):te(s,i),oe(e)&&n.set(e,s),s)}function yi(e,t){return!e||!vu(t)?!1:(t=t.slice(2).replace(/Once$/,""),re(e,t[0].toLowerCase()+t.slice(1))||re(e,qe(t))||re(e,t))}function _r(e){const{type:t,vnode:u,proxy:n,withProxy:r,propsOptions:[i],slots:s,attrs:o,emit:c,render:l,renderCache:a,props:f,data:p,setupState:d,ctx:h,inheritAttrs:_}=e,v=In(e);let A,E;try{if(u.shapeFlag&4){const g=r||n,x=g;A=We(l.call(x,g,a,f,d,p,h)),E=o}else{const g=t;A=We(g.length>1?g(f,{attrs:o,slots:s,emit:c}):g(f,null)),E=t.props?o:M1(o)}}catch(g){kn.length=0,Fu(g,e,1),A=fe(ye)}let m=A;if(E&&_!==!1){const g=Object.keys(E),{shapeFlag:x}=m;g.length&&x&7&&(i&&g.some(Rs)&&(E=P1(E,i)),m=vt(m,E,!1,!0))}return u.dirs&&(m=vt(m,null,!1,!0),m.dirs=m.dirs?m.dirs.concat(u.dirs):u.dirs),u.transition&&Bt(m,u.transition),A=m,In(v),A}function R1(e,t=!0){let u;for(let n=0;n<e.length;n++){const r=e[n];if(Vt(r)){if(r.type!==ye||r.children==="v-if"){if(u)return;u=r}}else return}return u}const M1=e=>{let t;for(const u in e)(u==="class"||u==="style"||vu(u))&&((t||(t={}))[u]=e[u]);return t},P1=(e,t)=>{const u={};for(const n in e)(!Rs(n)||!(n.slice(9)in t))&&(u[n]=e[n]);return u};function L1(e,t,u){const{props:n,children:r,component:i}=e,{props:s,children:o,patchFlag:c}=t,l=i.emitsOptions;if(t.dirs||t.transition)return!0;if(u&&c>=0){if(c&1024)return!0;if(c&16)return n?fc(n,s,l):!!s;if(c&8){const a=t.dynamicProps;for(let f=0;f<a.length;f++){const p=a[f];if(s[p]!==n[p]&&!yi(l,p))return!0}}}else return(r||o)&&(!o||!o.$stable)?!0:n===s?!1:n?s?fc(n,s,l):!0:!!s;return!1}function fc(e,t,u){const n=Object.keys(t);if(n.length!==Object.keys(e).length)return!0;for(let r=0;r<n.length;r++){const i=n[r];if(t[i]!==e[i]&&!yi(u,i))return!0}return!1}function xi({vnode:e,parent:t},u){for(;t;){const 
n=t.subTree;if(n.suspense&&n.suspense.activeBranch===e&&(n.el=e.el),n===e)(e=t.vnode).el=u,t=t.parent;else break}}const Nr=e=>e.__isSuspense;let hs=0;const B1={name:"Suspense",__isSuspense:!0,process(e,t,u,n,r,i,s,o,c,l){if(e==null)$1(t,u,n,r,i,s,o,c,l);else{if(i&&i.deps>0&&!e.suspense.isInFallback){t.suspense=e.suspense,t.suspense.vnode=t,t.el=e.el;return}H1(e,t,u,n,r,s,o,c,l)}},hydrate:U1,normalize:q1},V1=B1;function Rn(e,t){const u=e.props&&e.props[t];W(u)&&u()}function $1(e,t,u,n,r,i,s,o,c){const{p:l,o:{createElement:a}}=c,f=a("div"),p=e.suspense=Ha(e,r,n,t,f,u,i,s,o,c);l(null,p.pendingBranch=e.ssContent,f,null,n,p,i,s),p.deps>0?(Rn(e,"onPending"),Rn(e,"onFallback"),l(null,e.ssFallback,t,u,n,null,i,s),Uu(p,e.ssFallback)):p.resolve(!1,!0)}function H1(e,t,u,n,r,i,s,o,{p:c,um:l,o:{createElement:a}}){const f=t.suspense=e.suspense;f.vnode=t,t.el=e.el;const p=t.ssContent,d=t.ssFallback,{activeBranch:h,pendingBranch:_,isInFallback:v,isHydrating:A}=f;if(_)f.pendingBranch=p,mt(p,_)?(c(_,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0?f.resolve():v&&(A||(c(h,d,u,n,r,null,i,s,o),Uu(f,d)))):(f.pendingId=hs++,A?(f.isHydrating=!1,f.activeBranch=_):l(_,r,f),f.deps=0,f.effects.length=0,f.hiddenContainer=a("div"),v?(c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0?f.resolve():(c(h,d,u,n,r,null,i,s,o),Uu(f,d))):h&&mt(p,h)?(c(h,p,u,n,r,f,i,s,o),f.resolve(!0)):(c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0&&f.resolve()));else if(h&&mt(p,h))c(h,p,u,n,r,f,i,s,o),Uu(f,p);else if(Rn(t,"onPending"),f.pendingBranch=p,p.shapeFlag&512?f.pendingId=p.component.suspenseId:f.pendingId=hs++,c(null,p,f.hiddenContainer,null,r,f,i,s,o),f.deps<=0)f.resolve();else{const{timeout:E,pendingId:m}=f;E>0?setTimeout(()=>{f.pendingId===m&&f.fallback(d)},E):E===0&&f.fallback(d)}}function Ha(e,t,u,n,r,i,s,o,c,l,a=!1){const{p:f,m:p,um:d,n:h,o:{parentNode:_,remove:v}}=l;let A;const E=j1(e);E&&t&&t.pendingBranch&&(A=t.pendingId,t.deps++);const m=e.props?vr(e.props.timeout):void 
0,g=i,x={vnode:e,parent:t,parentComponent:u,namespace:s,container:n,hiddenContainer:r,deps:0,pendingId:hs++,timeout:typeof m=="number"?m:-1,activeBranch:null,pendingBranch:null,isInFallback:!a,isHydrating:a,isUnmounted:!1,effects:[],resolve(k=!1,I=!1){const{vnode:w,activeBranch:C,pendingBranch:S,pendingId:R,effects:T,parentComponent:P,container:q}=x;let z=!1;x.isHydrating?x.isHydrating=!1:k||(z=C&&S.transition&&S.transition.mode==="out-in",z&&(C.transition.afterLeave=()=>{R===x.pendingId&&(p(S,q,i===g?h(C):i,0),wn(T))}),C&&(_(C.el)===q&&(i=h(C)),d(C,P,x,!0)),z||p(S,q,i,0)),Uu(x,S),x.pendingBranch=null,x.isInFallback=!1;let H=x.parent,J=!1;for(;H;){if(H.pendingBranch){H.effects.push(...T),J=!0;break}H=H.parent}!J&&!z&&wn(T),x.effects=[],E&&t&&t.pendingBranch&&A===t.pendingId&&(t.deps--,t.deps===0&&!I&&t.resolve()),Rn(w,"onResolve")},fallback(k){if(!x.pendingBranch)return;const{vnode:I,activeBranch:w,parentComponent:C,container:S,namespace:R}=x;Rn(I,"onFallback");const T=h(w),P=()=>{x.isInFallback&&(f(null,k,S,T,C,null,R,o,c),Uu(x,k))},q=k.transition&&k.transition.mode==="out-in";q&&(w.transition.afterLeave=P),x.isInFallback=!0,d(w,C,null,!0),q||P()},move(k,I,w){x.activeBranch&&p(x.activeBranch,k,I,w),x.container=k},next(){return x.activeBranch&&h(x.activeBranch)},registerDep(k,I,w){const C=!!x.pendingBranch;C&&x.deps++;const S=k.vnode.el;k.asyncDep.catch(R=>{Fu(R,k,0)}).then(R=>{if(k.isUnmounted||x.isUnmounted||x.pendingId!==k.suspenseId)return;k.asyncResolved=!0;const{vnode:T}=k;gs(k,R,!1),S&&(T.el=S);const P=!S&&k.subTree.el;I(k,T,_(S||k.subTree.el),S?null:h(k.subTree),x,s,w),P&&v(P),xi(k,T.el),C&&--x.deps===0&&x.resolve()})},unmount(k,I){x.isUnmounted=!0,x.activeBranch&&d(x.activeBranch,u,k,I),x.pendingBranch&&d(x.pendingBranch,u,k,I)}};return x}function U1(e,t,u,n,r,i,s,o,c){const l=t.suspense=Ha(t,n,u,e.parentNode,document.createElement("div"),null,r,i,s,o,!0),a=c(e,l.pendingBranch=t.ssContent,u,l,i,s);return l.deps===0&&l.resolve(!1,!0),a}function 
q1(e){const{shapeFlag:t,children:u}=e,n=t&32;e.ssContent=dc(n?u.default:u),e.ssFallback=n?dc(u.fallback):fe(ye)}function dc(e){let t;if(W(e)){const u=ku&&e._c;u&&(e._d=!1,Mn()),e=e(),u&&(e._d=!0,t=Pe,qa())}return $(e)&&(e=R1(e)),e=We(e),t&&!e.dynamicChildren&&(e.dynamicChildren=t.filter(u=>u!==e)),e}function Ua(e,t){t&&t.pendingBranch?$(e)?t.effects.push(...e):t.effects.push(e):wn(e)}function Uu(e,t){e.activeBranch=t;const{vnode:u,parentComponent:n}=e;let r=t.el;for(;!r&&t.component;)t=t.component.subTree,r=t.el;u.el=r,n&&n.subTree===u&&(n.vnode.el=r,xi(n,r))}function j1(e){const t=e.props&&e.props.suspensible;return t!=null&&t!==!1}const Ee=Symbol.for("v-fgt"),Pt=Symbol.for("v-txt"),ye=Symbol.for("v-cmt"),ru=Symbol.for("v-stc"),kn=[];let Pe=null;function Mn(e=!1){kn.push(Pe=e?null:[])}function qa(){kn.pop(),Pe=kn[kn.length-1]||null}let ku=1;function ps(e,t=!1){ku+=e,e<0&&Pe&&t&&(Pe.hasOnce=!0)}function ja(e){return e.dynamicChildren=ku>0?Pe||Pu:null,qa(),ku>0&&Pe&&Pe.push(e),e}function z1(e,t,u,n,r,i){return ja(so(e,t,u,n,r,i,!0))}function Rr(e,t,u,n,r){return ja(fe(e,t,u,n,r,!0))}function Vt(e){return e?e.__v_isVNode===!0:!1}function mt(e,t){return e.type===t.type&&e.key===t.key}function K1(e){}const za=({key:e})=>e??null,yr=({ref:e,ref_key:t,ref_for:u})=>(typeof e=="number"&&(e=""+e),e!=null?Z(e)||Ce(e)||W(e)?{i:ve,r:e,k:t,f:!!u}:e:null);function so(e,t=null,u=null,n=0,r=null,i=e===Ee?0:1,s=!1,o=!1){const c={__v_isVNode:!0,__v_skip:!0,type:e,props:t,key:t&&za(t),ref:t&&yr(t),scopeId:hi,slotScopeIds:null,children:u,component:null,suspense:null,ssContent:null,ssFallback:null,dirs:null,transition:null,el:null,anchor:null,target:null,targetStart:null,targetAnchor:null,staticCount:0,shapeFlag:i,patchFlag:n,dynamicProps:r,dynamicChildren:null,appContext:null,ctx:ve};return o?(co(c,u),i&128&&e.normalize(c)):u&&(c.shapeFlag|=Z(u)?8:16),ku>0&&!s&&Pe&&(c.patchFlag>0||i&6)&&c.patchFlag!==32&&Pe.push(c),c}const fe=W1;function 
W1(e,t=null,u=null,n=0,r=null,i=!1){if((!e||e===ba)&&(e=ye),Vt(e)){const o=vt(e,t,!0);return u&&co(o,u),ku>0&&!i&&Pe&&(o.shapeFlag&6?Pe[Pe.indexOf(e)]=o:Pe.push(o)),o.patchFlag=-2,o}if(np(e)&&(e=e.__vccOpts),t){t=Ka(t);let{class:o,style:c}=t;o&&!Z(o)&&(t.class=nn(o)),oe(c)&&(li(c)&&!$(c)&&(c=te({},c)),t.style=un(c))}const s=Z(e)?1:Nr(e)?128:Ql(e)?64:oe(e)?4:W(e)?2:0;return so(e,t,u,n,r,s,i,!0)}function Ka(e){return e?li(e)||ka(e)?te({},e):e:null}function vt(e,t,u=!1,n=!1){const{props:r,ref:i,patchFlag:s,children:o,transition:c}=e,l=t?Wa(r||{},t):r,a={__v_isVNode:!0,__v_skip:!0,type:e.type,props:l,key:l&&za(l),ref:t&&t.ref?u&&i?$(i)?i.concat(yr(t)):[i,yr(t)]:yr(t):i,scopeId:e.scopeId,slotScopeIds:e.slotScopeIds,children:o,target:e.target,targetStart:e.targetStart,targetAnchor:e.targetAnchor,staticCount:e.staticCount,shapeFlag:e.shapeFlag,patchFlag:t&&e.type!==Ee?s===-1?16:s|16:s,dynamicProps:e.dynamicProps,dynamicChildren:e.dynamicChildren,appContext:e.appContext,dirs:e.dirs,transition:c,component:e.component,suspense:e.suspense,ssContent:e.ssContent&&vt(e.ssContent),ssFallback:e.ssFallback&&vt(e.ssFallback),el:e.el,anchor:e.anchor,ctx:e.ctx,ce:e.ce};return c&&n&&Bt(a,c.clone(a)),a}function oo(e=" ",t=0){return fe(Pt,null,e,t)}function G1(e,t){const u=fe(ru,null,e);return u.staticCount=t,u}function Z1(e="",t=!1){return t?(Mn(),Rr(ye,null,e)):fe(ye,null,e)}function We(e){return e==null||typeof e=="boolean"?fe(ye):$(e)?fe(Ee,null,e.slice()):Vt(e)?Jt(e):fe(Pt,null,String(e))}function Jt(e){return e.el===null&&e.patchFlag!==-1||e.memo?e:vt(e)}function co(e,t){let u=0;const{shapeFlag:n}=e;if(t==null)t=null;else if($(t))u=16;else if(typeof t=="object")if(n&65){const r=t.default;r&&(r._c&&(r._d=!1),co(e,r()),r._c&&(r._d=!0));return}else{u=32;const r=t._;!r&&!ka(t)?t._ctx=ve:r===3&&ve&&(ve.slots._===1?t._=1:(t._=2,e.patchFlag|=1024))}else W(t)?(t={default:t,_ctx:ve},u=32):(t=String(t),n&64?(u=16,t=[oo(t)]):u=8);e.children=t,e.shapeFlag|=u}function Wa(...e){const 
t={};for(let u=0;u<e.length;u++){const n=e[u];for(const r in n)if(r==="class")t.class!==n.class&&(t.class=nn([t.class,n.class]));else if(r==="style")t.style=un([t.style,n.style]);else if(vu(r)){const i=t[r],s=n[r];s&&i!==s&&!($(i)&&i.includes(s))&&(t[r]=i?[].concat(i,s):s)}else r!==""&&(t[r]=n[r])}return t}function Ke(e,t,u,n=null){lt(e,t,7,[u,n])}const J1=ya();let Y1=0;function Ga(e,t,u){const n=e.type,r=(t?t.appContext:e.appContext)||J1,i={uid:Y1++,vnode:e,type:n,parent:t,appContext:r,root:null,next:null,subTree:null,effect:null,update:null,job:null,scope:new Vs(!0),render:null,proxy:null,exposed:null,exposeProxy:null,withProxy:null,provides:t?t.provides:Object.create(r.provides),ids:t?t.ids:["",0,0],accessCache:null,renderCache:[],components:null,directives:null,propsOptions:Sa(n,r),emitsOptions:$a(n,r),emit:null,emitted:null,propsDefaults:ee,inheritAttrs:n.inheritAttrs,ctx:ee,data:ee,props:ee,attrs:ee,slots:ee,refs:ee,setupState:ee,setupContext:null,suspense:u,suspenseId:u?u.pendingId:0,asyncDep:null,asyncResolved:!1,isMounted:!1,isUnmounted:!1,isDeactivated:!1,bc:null,c:null,bm:null,m:null,bu:null,u:null,um:null,bum:null,da:null,a:null,rtg:null,rtc:null,ec:null,sp:null};return i.ctx={_:i},i.root=t?t.root:i,i.emit=N1.bind(null,i),e.ce&&e.ce(i),i}let Se=null;const ft=()=>Se||ve;let Mr,ms;{const e=gu(),t=(u,n)=>{let r;return(r=e[u])||(r=e[u]=[]),r.push(n),i=>{r.length>1?r.forEach(s=>s(i)):r[0](i)}};Mr=t("__VUE_INSTANCE_SETTERS__",u=>Se=u),ms=t("__VUE_SSR_SETTERS__",u=>Ku=u)}const Au=e=>{const t=Se;return Mr(e),e.scope.on(),()=>{e.scope.off(),Mr(t)}},bs=()=>{Se&&Se.scope.off(),Mr(null)};function Za(e){return e.vnode.shapeFlag&4}let Ku=!1;function Ja(e,t=!1,u=!1){t&&ms(t);const{props:n,children:r}=e.vnode,i=Za(e);E1(e,n,i,t),S1(e,r,u);const s=i?X1(e,t):void 0;return t&&ms(!1),s}function X1(e,t){const u=e.type;e.accessCache=Object.create(null),e.proxy=new Proxy(e.ctx,ls);const{setup:n}=u;if(n){$t();const 
r=e.setupContext=n.length>1?Qa(e):null,i=Au(e),s=wu(n,e,0,[e.props,r]),o=Ps(s);if(Ht(),i(),(o||e.sp)&&!nu(e)&&Xs(e),o){if(s.then(bs,bs),t)return s.then(c=>{gs(e,c,t)}).catch(c=>{Fu(c,e,0)});e.asyncDep=s}else gs(e,s,t)}else Xa(e,t)}function gs(e,t,u){W(t)?e.type.__ssrInlineRender?e.ssrRender=t:e.render=t:oe(t)&&(__VUE_PROD_DEVTOOLS__&&(e.devtoolsRawSetupState=t),e.setupState=zs(t)),Xa(e,u)}let Pr,_s;function Ya(e){Pr=e,_s=t=>{t.render._rc&&(t.withProxy=new Proxy(t.ctx,Qh))}}const Q1=()=>!Pr;function Xa(e,t,u){const n=e.type;if(!e.render){if(!t&&Pr&&!n.render){const r=n.template||__VUE_OPTIONS_API__&&no(e).template;if(r){const{isCustomElement:i,compilerOptions:s}=e.appContext.config,{delimiters:o,compilerOptions:c}=n,l=te(te({isCustomElement:i,delimiters:o},s),c);n.render=Pr(r,l)}}e.render=n.render||we,_s&&_s(e)}if(__VUE_OPTIONS_API__){const r=Au(e);$t();try{h1(e)}finally{Ht(),r()}}}const ep={get(e,t){return Re(e,"get",""),e[t]}};function Qa(e){const t=u=>{e.exposed=u||{}};return{attrs:new Proxy(e.attrs,ep),slots:e.slots,emit:e.emit,expose:t}}function Jn(e){return e.exposed?e.exposeProxy||(e.exposeProxy=new Proxy(zs(Ul(e.exposed)),{get(t,u){if(u in t)return t[u];if(u in En)return En[u](e)},has(t,u){return u in t||u in En}})):e.proxy}const tp=/(?:^|[-_])(\w)/g,up=e=>e.replace(tp,t=>t.toUpperCase()).replace(/[-_]/g,"");function Lr(e,t=!0){return W(e)?e.displayName||e.name:e.name||t&&e.__name}function e0(e,t,u=!1){let n=Lr(t);if(!n&&t.__file){const r=t.__file.match(/([^/\\]+)\.\w+$/);r&&(n=r[1])}if(!n&&e&&e.parent){const r=i=>{for(const s in i)if(i[s]===t)return s};n=r(e.components||e.parent.type.components)||r(e.appContext.components)}return n?up(n):u?"App":"Anonymous"}function np(e){return W(e)&&"__vccOpts"in e}const lo=(e,t)=>Yd(e,t,Ku);function ao(e,t,u){const n=arguments.length;return n===2?oe(t)&&!$(t)?Vt(t)?fe(e,null,[t]):fe(e,t):fe(e,null,t):(n>3?u=Array.prototype.slice.call(arguments,2):n===3&&Vt(u)&&(u=[u]),fe(e,t,u))}function rp(){}function ip(e,t,u,n){const 
r=u[n];if(r&&t0(r,e))return r;const i=t();return i.memo=e.slice(),i.cacheIndex=n,u[n]=i}function t0(e,t){const u=e.memo;if(u.length!=t.length)return!1;for(let n=0;n<u.length;n++)if(He(u[n],t[n]))return!1;return ku>0&&Pe&&Pe.push(e),!0}const ys="3.5.13",sp=we,op=ah,cp=pt,lp=Ws,ap={createComponentInstance:Ga,setupComponent:Ja,renderComponentRoot:_r,setCurrentRenderingInstance:In,isVNode:Vt,normalizeVNode:We,getComponentPublicInstance:Jn,ensureValidVNode:uo,pushWarningContext:uh,popWarningContext:nh},fp=ap,dp=null,hp=null,pp=null;/**
-* @vue/runtime-dom v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/let xs;const hc=typeof window<"u"&&window.trustedTypes;if(hc)try{xs=hc.createPolicy("vue",{createHTML:e=>e})}catch{}const u0=xs?e=>xs.createHTML(e):e=>e,mp="http://www.w3.org/2000/svg",bp="http://www.w3.org/1998/Math/MathML",Ot=typeof document<"u"?document:null,pc=Ot&&Ot.createElement("template"),gp={insert:(e,t,u)=>{t.insertBefore(e,u||null)},remove:e=>{const t=e.parentNode;t&&t.removeChild(e)},createElement:(e,t,u,n)=>{const r=t==="svg"?Ot.createElementNS(mp,e):t==="mathml"?Ot.createElementNS(bp,e):u?Ot.createElement(e,{is:u}):Ot.createElement(e);return e==="select"&&n&&n.multiple!=null&&r.setAttribute("multiple",n.multiple),r},createText:e=>Ot.createTextNode(e),createComment:e=>Ot.createComment(e),setText:(e,t)=>{e.nodeValue=t},setElementText:(e,t)=>{e.textContent=t},parentNode:e=>e.parentNode,nextSibling:e=>e.nextSibling,querySelector:e=>Ot.querySelector(e),setScopeId(e,t){e.setAttribute(t,"")},insertStaticContent(e,t,u,n,r,i){const s=u?u.previousSibling:t.lastChild;if(r&&(r===i||r.nextSibling))for(;t.insertBefore(r.cloneNode(!0),u),!(r===i||!(r=r.nextSibling)););else{pc.innerHTML=u0(n==="svg"?`<svg>${e}</svg>`:n==="mathml"?`<math>${e}</math>`:e);const o=pc.content;if(n==="svg"||n==="mathml"){const c=o.firstChild;for(;c.firstChild;)o.appendChild(c.firstChild);o.removeChild(c)}t.insertBefore(o,u)}return[s?s.nextSibling:t.firstChild,u?u.previousSibling:t.lastChild]}},qt="transition",ln="animation",Wu=Symbol("_vtc"),n0={name:String,type:String,css:{type:Boolean,default:!0},duration:[String,Number,Object],enterFromClass:String,enterActiveClass:String,enterToClass:String,appearFromClass:String,appearActiveClass:String,appearToClass:String,leaveFromClass:String,leaveActiveClass:String,leaveToClass:String},r0=te({},Ys,n0),_p=e=>(e.displayName="Transition",e.props=r0,e),yp=_p((e,{slots:t})=>ao(ra,i0(e),t)),du=(e,t=[])=>{$(e)?e.forEach(u=>u(...t)):e&&e(...t)},mc=e=>e?$(e)?e.some(t=>t.length>1):e.length>1:!1;function i0(e){const t={};for(const T in e)T in 
n0||(t[T]=e[T]);if(e.css===!1)return t;const{name:u="v",type:n,duration:r,enterFromClass:i=`${u}-enter-from`,enterActiveClass:s=`${u}-enter-active`,enterToClass:o=`${u}-enter-to`,appearFromClass:c=i,appearActiveClass:l=s,appearToClass:a=o,leaveFromClass:f=`${u}-leave-from`,leaveActiveClass:p=`${u}-leave-active`,leaveToClass:d=`${u}-leave-to`}=e,h=xp(r),_=h&&h[0],v=h&&h[1],{onBeforeEnter:A,onEnter:E,onEnterCancelled:m,onLeave:g,onLeaveCancelled:x,onBeforeAppear:k=A,onAppear:I=E,onAppearCancelled:w=m}=t,C=(T,P,q,z)=>{T._enterCancelled=z,Kt(T,P?a:o),Kt(T,P?l:s),q&&q()},S=(T,P)=>{T._isLeaving=!1,Kt(T,f),Kt(T,d),Kt(T,p),P&&P()},R=T=>(P,q)=>{const z=T?I:E,H=()=>C(P,T,q);du(z,[P,H]),bc(()=>{Kt(P,T?c:i),Et(P,T?a:o),mc(z)||gc(P,n,_,H)})};return te(t,{onBeforeEnter(T){du(A,[T]),Et(T,i),Et(T,s)},onBeforeAppear(T){du(k,[T]),Et(T,c),Et(T,l)},onEnter:R(!1),onAppear:R(!0),onLeave(T,P){T._isLeaving=!0;const q=()=>S(T,P);Et(T,f),T._enterCancelled?(Et(T,p),Es()):(Es(),Et(T,p)),bc(()=>{T._isLeaving&&(Kt(T,f),Et(T,d),mc(g)||gc(T,n,v,q))}),du(g,[T,q])},onEnterCancelled(T){C(T,!1,void 0,!0),du(m,[T])},onAppearCancelled(T){C(T,!0,void 0,!0),du(w,[T])},onLeaveCancelled(T){S(T),du(x,[T])}})}function xp(e){if(e==null)return null;if(oe(e))return[qi(e.enter),qi(e.leave)];{const t=qi(e);return[t,t]}}function qi(e){return vr(e)}function Et(e,t){t.split(/\s+/).forEach(u=>u&&e.classList.add(u)),(e[Wu]||(e[Wu]=new Set)).add(t)}function Kt(e,t){t.split(/\s+/).forEach(n=>n&&e.classList.remove(n));const u=e[Wu];u&&(u.delete(t),u.size||(e[Wu]=void 0))}function bc(e){requestAnimationFrame(()=>{requestAnimationFrame(e)})}let Ep=0;function gc(e,t,u,n){const r=e._endId=++Ep,i=()=>{r===e._endId&&n()};if(u!=null)return setTimeout(i,u);const{type:s,timeout:o,propCount:c}=s0(e,t);if(!s)return n();const l=s+"end";let a=0;const f=()=>{e.removeEventListener(l,p),i()},p=d=>{d.target===e&&++a>=c&&f()};setTimeout(()=>{a<c&&f()},o+1),e.addEventListener(l,p)}function s0(e,t){const 
u=window.getComputedStyle(e),n=h=>(u[h]||"").split(", "),r=n(`${qt}Delay`),i=n(`${qt}Duration`),s=_c(r,i),o=n(`${ln}Delay`),c=n(`${ln}Duration`),l=_c(o,c);let a=null,f=0,p=0;t===qt?s>0&&(a=qt,f=s,p=i.length):t===ln?l>0&&(a=ln,f=l,p=c.length):(f=Math.max(s,l),a=f>0?s>l?qt:ln:null,p=a?a===qt?i.length:c.length:0);const d=a===qt&&/\b(transform|all)(,|$)/.test(n(`${qt}Property`).toString());return{type:a,timeout:f,propCount:p,hasTransform:d}}function _c(e,t){for(;e.length<t.length;)e=e.concat(e);return Math.max(...t.map((u,n)=>yc(u)+yc(e[n])))}function yc(e){return e==="auto"?0:Number(e.slice(0,-1).replace(",","."))*1e3}function Es(){return document.body.offsetHeight}function Cp(e,t,u){const n=e[Wu];n&&(t=(t?[t,...n]:[...n]).join(" ")),t==null?e.removeAttribute("class"):u?e.setAttribute("class",t):e.className=t}const Br=Symbol("_vod"),o0=Symbol("_vsh"),c0={beforeMount(e,{value:t},{transition:u}){e[Br]=e.style.display==="none"?"":e.style.display,u&&t?u.beforeEnter(e):an(e,t)},mounted(e,{value:t},{transition:u}){u&&t&&u.enter(e)},updated(e,{value:t,oldValue:u},{transition:n}){!t!=!u&&(n?t?(n.beforeEnter(e),an(e,!0),n.enter(e)):n.leave(e,()=>{an(e,!1)}):an(e,t))},beforeUnmount(e,{value:t}){an(e,t)}};function an(e,t){e.style.display=t?e[Br]:"none",e[o0]=!t}function kp(){c0.getSSRProps=({value:e})=>{if(!e)return{style:{display:"none"}}}}const l0=Symbol("");function Ap(e){const t=ft();if(!t)return;const u=t.ut=(r=e(t.proxy))=>{Array.from(document.querySelectorAll(`[data-v-owner="${t.uid}"]`)).forEach(i=>Vr(i,r))},n=()=>{const r=e(t.proxy);t.ce?Vr(t.ce,r):Cs(t.subTree,r),u(r)};Qs(()=>{wn(n)}),Gn(()=>{Hu(n,we,{flush:"post"});const r=new MutationObserver(n);r.observe(t.subTree.el.parentNode,{childList:!0}),_i(()=>r.disconnect())})}function Cs(e,t){if(e.shapeFlag&128){const u=e.suspense;e=u.activeBranch,u.pendingBranch&&!u.isHydrating&&u.effects.push(()=>{Cs(u.activeBranch,t)})}for(;e.component;)e=e.component.subTree;if(e.shapeFlag&1&&e.el)Vr(e.el,t);else 
if(e.type===Ee)e.children.forEach(u=>Cs(u,t));else if(e.type===ru){let{el:u,anchor:n}=e;for(;u&&(Vr(u,t),u!==n);)u=u.nextSibling}}function Vr(e,t){if(e.nodeType===1){const u=e.style;let n="";for(const r in t)u.setProperty(`--${r}`,t[r]),n+=`--${r}: ${t[r]};`;u[l0]=n}}const Sp=/(^|;)\s*display\s*:/;function vp(e,t,u){const n=e.style,r=Z(u);let i=!1;if(u&&!r){if(t)if(Z(t))for(const s of t.split(";")){const o=s.slice(0,s.indexOf(":")).trim();u[o]==null&&xr(n,o,"")}else for(const s in t)u[s]==null&&xr(n,s,"");for(const s in u)s==="display"&&(i=!0),xr(n,s,u[s])}else if(r){if(t!==u){const s=n[l0];s&&(u+=";"+s),n.cssText=u,i=Sp.test(u)}}else t&&e.removeAttribute("style");Br in e&&(e[Br]=i?n.display:"",e[o0]&&(n.display="none"))}const xc=/\s*!important$/;function xr(e,t,u){if($(u))u.forEach(n=>xr(e,t,n));else if(u==null&&(u=""),t.startsWith("--"))e.setProperty(t,u);else{const n=Dp(e,t);xc.test(u)?e.setProperty(qe(n),u.replace(xc,""),"important"):e[n]=u}}const Ec=["Webkit","Moz","ms"],ji={};function Dp(e,t){const u=ji[t];if(u)return u;let n=de(t);if(n!=="filter"&&n in e)return ji[t]=n;n=Tu(n);for(let r=0;r<Ec.length;r++){const i=Ec[r]+n;if(i in e)return ji[t]=i}return t}const Cc="http://www.w3.org/1999/xlink";function kc(e,t,u,n,r,i=ad(t)){n&&t.startsWith("xlink:")?u==null?e.removeAttributeNS(Cc,t.slice(6,t.length)):e.setAttributeNS(Cc,t,u):u==null||i&&!Bs(u)?e.removeAttribute(t):e.setAttribute(t,i?"":Ze(u)?String(u):u)}function Ac(e,t,u,n,r){if(t==="innerHTML"||t==="textContent"){u!=null&&(e[t]=t==="innerHTML"?u0(u):u);return}const i=e.tagName;if(t==="value"&&i!=="PROGRESS"&&!i.includes("-")){const o=i==="OPTION"?e.getAttribute("value")||"":e.value,c=u==null?e.type==="checkbox"?"on":"":String(u);(o!==c||!("_value"in e))&&(e.value=c),u==null&&e.removeAttribute(t),e._value=u;return}let s=!1;if(u===""||u==null){const o=typeof e[t];o==="boolean"?u=Bs(u):u==null&&o==="string"?(u="",s=!0):o==="number"&&(u=0,s=!0)}try{e[t]=u}catch{}s&&e.removeAttribute(r||t)}function 
Mt(e,t,u,n){e.addEventListener(t,u,n)}function Tp(e,t,u,n){e.removeEventListener(t,u,n)}const Sc=Symbol("_vei");function wp(e,t,u,n,r=null){const i=e[Sc]||(e[Sc]={}),s=i[t];if(n&&s)s.value=n;else{const[o,c]=Fp(t);if(n){const l=i[t]=Np(n,r);Mt(e,o,l,c)}else s&&(Tp(e,o,s,c),i[t]=void 0)}}const vc=/(?:Once|Passive|Capture)$/;function Fp(e){let t;if(vc.test(e)){t={};let n;for(;n=e.match(vc);)e=e.slice(0,e.length-n[0].length),t[n[0].toLowerCase()]=!0}return[e[2]===":"?e.slice(3):qe(e.slice(2)),t]}let zi=0;const Ip=Promise.resolve(),Op=()=>zi||(Ip.then(()=>zi=0),zi=Date.now());function Np(e,t){const u=n=>{if(!n._vts)n._vts=Date.now();else if(n._vts<=u.attached)return;lt(Rp(n,u.value),t,5,[n])};return u.value=e,u.attached=Op(),u}function Rp(e,t){if($(t)){const u=e.stopImmediatePropagation;return e.stopImmediatePropagation=()=>{u.call(e),e._stopped=!0},t.map(n=>r=>!r._stopped&&n&&n(r))}else return t}const Dc=e=>e.charCodeAt(0)===111&&e.charCodeAt(1)===110&&e.charCodeAt(2)>96&&e.charCodeAt(2)<123,Mp=(e,t,u,n,r,i)=>{const s=r==="svg";t==="class"?Cp(e,n,s):t==="style"?vp(e,u,n):vu(t)?Rs(t)||wp(e,t,u,n,i):(t[0]==="."?(t=t.slice(1),!0):t[0]==="^"?(t=t.slice(1),!1):Pp(e,t,n,s))?(Ac(e,t,n),!e.tagName.includes("-")&&(t==="value"||t==="checked"||t==="selected")&&kc(e,t,n,s,i,t!=="value")):e._isVueCE&&(/[A-Z]/.test(t)||!Z(n))?Ac(e,de(t),n,i,t):(t==="true-value"?e._trueValue=n:t==="false-value"&&(e._falseValue=n),kc(e,t,n,s))};function Pp(e,t,u,n){if(n)return!!(t==="innerHTML"||t==="textContent"||t in e&&Dc(t)&&W(u));if(t==="spellcheck"||t==="draggable"||t==="translate"||t==="form"||t==="list"&&e.tagName==="INPUT"||t==="type"&&e.tagName==="TEXTAREA")return!1;if(t==="width"||t==="height"){const r=e.tagName;if(r==="IMG"||r==="VIDEO"||r==="CANVAS"||r==="SOURCE")return!1}return Dc(t)&&Z(u)?!1:t in e}const Tc={};/*! #__NO_SIDE_EFFECTS__ */function a0(e,t,u){const n=Kn(e,t);ti(n)&&te(n,t);class r extends Ei{constructor(s){super(n,s,u)}}return r.def=n,r}/*! 
#__NO_SIDE_EFFECTS__ */const Lp=(e,t)=>a0(e,t,C0),Bp=typeof HTMLElement<"u"?HTMLElement:class{};class Ei extends Bp{constructor(t,u={},n=Ur){super(),this._def=t,this._props=u,this._createApp=n,this._isVueCE=!0,this._instance=null,this._app=null,this._nonce=this._def.nonce,this._connected=!1,this._resolved=!1,this._numberProps=null,this._styleChildren=new WeakSet,this._ob=null,this.shadowRoot&&n!==Ur?this._root=this.shadowRoot:t.shadowRoot!==!1?(this.attachShadow({mode:"open"}),this._root=this.shadowRoot):this._root=this,this._def.__asyncLoader||this._resolveProps(this._def)}connectedCallback(){if(!this.isConnected)return;this.shadowRoot||this._parseSlots(),this._connected=!0;let t=this;for(;t=t&&(t.parentNode||t.host);)if(t instanceof Ei){this._parent=t;break}this._instance||(this._resolved?(this._setParent(),this._update()):t&&t._pendingResolve?this._pendingResolve=t._pendingResolve.then(()=>{this._pendingResolve=void 0,this._resolveDef()}):this._resolveDef())}_setParent(t=this._parent){t&&(this._instance.parent=t._instance,this._instance.provides=t._instance.provides)}disconnectedCallback(){this._connected=!1,fi(()=>{this._connected||(this._ob&&(this._ob.disconnect(),this._ob=null),this._app&&this._app.unmount(),this._instance&&(this._instance.ce=void 0),this._app=this._instance=null)})}_resolveDef(){if(this._pendingResolve)return;for(let n=0;n<this.attributes.length;n++)this._setAttr(this.attributes[n].name);this._ob=new MutationObserver(n=>{for(const r of n)this._setAttr(r.attributeName)}),this._ob.observe(this,{attributes:!0});const t=(n,r=!1)=>{this._resolved=!0,this._pendingResolve=void 0;const{props:i,styles:s}=n;let o;if(i&&!$(i))for(const c in i){const l=i[c];(l===Number||l&&l.type===Number)&&(c in 
this._props&&(this._props[c]=vr(this._props[c])),(o||(o=Object.create(null)))[de(c)]=!0)}this._numberProps=o,r&&this._resolveProps(n),this.shadowRoot&&this._applyStyles(s),this._mount(n)},u=this._def.__asyncLoader;u?this._pendingResolve=u().then(n=>t(this._def=n,!0)):t(this._def)}_mount(t){__VUE_PROD_DEVTOOLS__&&!t.name&&(t.name="VueElement"),this._app=this._createApp(t),t.configureApp&&t.configureApp(this._app),this._app._ceVNode=this._createVNode(),this._app.mount(this._root);const u=this._instance&&this._instance.exposed;if(u)for(const n in u)re(this,n)||Object.defineProperty(this,n,{get:()=>ai(u[n])})}_resolveProps(t){const{props:u}=t,n=$(u)?u:Object.keys(u||{});for(const r of Object.keys(this))r[0]!=="_"&&n.includes(r)&&this._setProp(r,this[r]);for(const r of n.map(de))Object.defineProperty(this,r,{get(){return this._getProp(r)},set(i){this._setProp(r,i,!0,!0)}})}_setAttr(t){if(t.startsWith("data-v-"))return;const u=this.hasAttribute(t);let n=u?this.getAttribute(t):Tc;const r=de(t);u&&this._numberProps&&this._numberProps[r]&&(n=vr(n)),this._setProp(r,n,!1,!0)}_getProp(t){return this._props[t]}_setProp(t,u,n=!0,r=!1){if(u!==this._props[t]&&(u===Tc?delete this._props[t]:(this._props[t]=u,t==="key"&&this._app&&(this._app._ceVNode.key=u)),r&&this._instance&&this._update(),n)){const i=this._ob;i&&i.disconnect(),u===!0?this.setAttribute(qe(t),""):typeof u=="string"||typeof u=="number"?this.setAttribute(qe(t),u+""):u||this.removeAttribute(qe(t)),i&&i.observe(this,{attributes:!0})}}_update(){E0(this._createVNode(),this._root)}_createVNode(){const t={};this.shadowRoot||(t.onVnodeMounted=t.onVnodeUpdated=this._renderSlots.bind(this));const u=fe(this._def,te(t,this._props));return this._instance||(u.ce=n=>{this._instance=n,n.ce=this,n.isCE=!0;const r=(i,s)=>{this.dispatchEvent(new 
CustomEvent(i,ti(s[0])?te({detail:s},s[0]):{detail:s}))};n.emit=(i,...s)=>{r(i,s),qe(i)!==i&&r(qe(i),s)},this._setParent()}),u}_applyStyles(t,u){if(!t)return;if(u){if(u===this._def||this._styleChildren.has(u))return;this._styleChildren.add(u)}const n=this._nonce;for(let r=t.length-1;r>=0;r--){const i=document.createElement("style");n&&i.setAttribute("nonce",n),i.textContent=t[r],this.shadowRoot.prepend(i)}}_parseSlots(){const t=this._slots={};let u;for(;u=this.firstChild;){const n=u.nodeType===1&&u.getAttribute("slot")||"default";(t[n]||(t[n]=[])).push(u),this.removeChild(u)}}_renderSlots(){const t=(this._teleportTarget||this).querySelectorAll("slot"),u=this._instance.type.__scopeId;for(let n=0;n<t.length;n++){const r=t[n],i=r.getAttribute("name")||"default",s=this._slots[i],o=r.parentNode;if(s)for(const c of s){if(u&&c.nodeType===1){const l=u+"-s",a=document.createTreeWalker(c,1);c.setAttribute(l,"");let f;for(;f=a.nextNode();)f.setAttribute(l,"")}o.insertBefore(c,r)}else for(;r.firstChild;)o.insertBefore(r.firstChild,r);o.removeChild(r)}}_injectChildStyle(t){this._applyStyles(t.styles,t)}_removeChildStyle(t){}}function f0(e){const t=ft(),u=t&&t.ce;return u||null}function Vp(){const e=f0();return e&&e.shadowRoot}function $p(e="$style"){{const t=ft();if(!t)return ee;const u=t.type.__cssModules;if(!u)return ee;const n=u[e];return n||ee}}const d0=new WeakMap,h0=new WeakMap,$r=Symbol("_moveCb"),wc=Symbol("_enterCb"),Hp=e=>(delete e.props.mode,e),Up=Hp({name:"TransitionGroup",props:te({},r0,{tag:String,moveClass:String}),setup(e,{slots:t}){const u=ft(),n=Js();let r,i;return bi(()=>{if(!r.length)return;const s=e.moveClass||`${e.name||"v"}-move`;if(!Wp(r[0].el,u.vnode.el,s))return;r.forEach(jp),r.forEach(zp);const o=r.filter(Kp);Es(),o.forEach(c=>{const l=c.el,a=l.style;Et(l,s),a.transform=a.webkitTransform=a.transitionDuration="";const 
f=l[$r]=p=>{p&&p.target!==l||(!p||/transform$/.test(p.propertyName))&&(l.removeEventListener("transitionend",f),l[$r]=null,Kt(l,s))};l.addEventListener("transitionend",f)})}),()=>{const s=Q(e),o=i0(s);let c=s.tag||Ee;if(r=[],i)for(let l=0;l<i.length;l++){const a=i[l];a.el&&a.el instanceof Element&&(r.push(a),Bt(a,zu(a,o,n,u)),d0.set(a,a.el.getBoundingClientRect()))}i=t.default?pi(t.default()):[];for(let l=0;l<i.length;l++){const a=i[l];a.key!=null&&Bt(a,zu(a,o,n,u))}return fe(c,null,i)}}}),qp=Up;function jp(e){const t=e.el;t[$r]&&t[$r](),t[wc]&&t[wc]()}function zp(e){h0.set(e,e.el.getBoundingClientRect())}function Kp(e){const t=d0.get(e),u=h0.get(e),n=t.left-u.left,r=t.top-u.top;if(n||r){const i=e.el.style;return i.transform=i.webkitTransform=`translate(${n}px,${r}px)`,i.transitionDuration="0s",e}}function Wp(e,t,u){const n=e.cloneNode(),r=e[Wu];r&&r.forEach(o=>{o.split(/\s+/).forEach(c=>c&&n.classList.remove(c))}),u.split(/\s+/).forEach(o=>o&&n.classList.add(o)),n.style.display="none";const i=t.nodeType===1?t:t.parentNode;i.appendChild(n);const{hasTransform:s}=s0(n);return i.removeChild(n),s}const cu=e=>{const t=e.props["onUpdate:modelValue"]||!1;return $(t)?u=>Vu(t,u):t};function Gp(e){e.target.composing=!0}function Fc(e){const t=e.target;t.composing&&(t.composing=!1,t.dispatchEvent(new Event("input")))}const ct=Symbol("_assign"),Hr={created(e,{modifiers:{lazy:t,trim:u,number:n}},r){e[ct]=cu(r);const i=n||r.props&&r.props.type==="number";Mt(e,t?"change":"input",s=>{if(s.target.composing)return;let o=e.value;u&&(o=o.trim()),i&&(o=Sr(o)),e[ct](o)}),u&&Mt(e,"change",()=>{e.value=e.value.trim()}),t||(Mt(e,"compositionstart",Gp),Mt(e,"compositionend",Fc),Mt(e,"change",Fc))},mounted(e,{value:t}){e.value=t??""},beforeUpdate(e,{value:t,oldValue:u,modifiers:{lazy:n,trim:r,number:i}},s){if(e[ct]=cu(s),e.composing)return;const 
o=(i||e.type==="number")&&!/^0\d/.test(e.value)?Sr(e.value):e.value,c=t??"";o!==c&&(document.activeElement===e&&e.type!=="range"&&(n&&t===u||r&&e.value.trim()===c)||(e.value=c))}},fo={deep:!0,created(e,t,u){e[ct]=cu(u),Mt(e,"change",()=>{const n=e._modelValue,r=Gu(e),i=e.checked,s=e[ct];if($(n)){const o=ni(n,r),c=o!==-1;if(i&&!c)s(n.concat(r));else if(!i&&c){const l=[...n];l.splice(o,1),s(l)}}else if(Du(n)){const o=new Set(n);i?o.add(r):o.delete(r),s(o)}else s(m0(e,i))})},mounted:Ic,beforeUpdate(e,t,u){e[ct]=cu(u),Ic(e,t,u)}};function Ic(e,{value:t,oldValue:u},n){e._modelValue=t;let r;if($(t))r=ni(t,n.props.value)>-1;else if(Du(t))r=t.has(n.props.value);else{if(t===u)return;r=su(t,m0(e,!0))}e.checked!==r&&(e.checked=r)}const ho={created(e,{value:t},u){e.checked=su(t,u.props.value),e[ct]=cu(u),Mt(e,"change",()=>{e[ct](Gu(e))})},beforeUpdate(e,{value:t,oldValue:u},n){e[ct]=cu(n),t!==u&&(e.checked=su(t,n.props.value))}},p0={deep:!0,created(e,{value:t,modifiers:{number:u}},n){const r=Du(t);Mt(e,"change",()=>{const i=Array.prototype.filter.call(e.options,s=>s.selected).map(s=>u?Sr(Gu(s)):Gu(s));e[ct](e.multiple?r?new Set(i):i:i[0]),e._assigning=!0,fi(()=>{e._assigning=!1})}),e[ct]=cu(n)},mounted(e,{value:t}){Oc(e,t)},beforeUpdate(e,t,u){e[ct]=cu(u)},updated(e,{value:t}){e._assigning||Oc(e,t)}};function Oc(e,t){const u=e.multiple,n=$(t);if(!(u&&!n&&!Du(t))){for(let r=0,i=e.options.length;r<i;r++){const s=e.options[r],o=Gu(s);if(u)if(n){const c=typeof o;c==="string"||c==="number"?s.selected=t.some(l=>String(l)===String(o)):s.selected=ni(t,o)>-1}else s.selected=t.has(o);else if(su(Gu(s),t)){e.selectedIndex!==r&&(e.selectedIndex=r);return}}!u&&e.selectedIndex!==-1&&(e.selectedIndex=-1)}}function Gu(e){return"_value"in e?e._value:e.value}function m0(e,t){const u=t?"_trueValue":"_falseValue";return u in e?e[u]:t}const 
b0={created(e,t,u){fr(e,t,u,null,"created")},mounted(e,t,u){fr(e,t,u,null,"mounted")},beforeUpdate(e,t,u,n){fr(e,t,u,n,"beforeUpdate")},updated(e,t,u,n){fr(e,t,u,n,"updated")}};function g0(e,t){switch(e){case"SELECT":return p0;case"TEXTAREA":return Hr;default:switch(t){case"checkbox":return fo;case"radio":return ho;default:return Hr}}}function fr(e,t,u,n,r){const s=g0(e.tagName,u.props&&u.props.type)[r];s&&s(e,t,u,n)}function Zp(){Hr.getSSRProps=({value:e})=>({value:e}),ho.getSSRProps=({value:e},t)=>{if(t.props&&su(t.props.value,e))return{checked:!0}},fo.getSSRProps=({value:e},t)=>{if($(e)){if(t.props&&ni(e,t.props.value)>-1)return{checked:!0}}else if(Du(e)){if(t.props&&e.has(t.props.value))return{checked:!0}}else if(e)return{checked:!0}},b0.getSSRProps=(e,t)=>{if(typeof t.type!="string")return;const u=g0(t.type.toUpperCase(),t.props&&t.props.type);if(u.getSSRProps)return u.getSSRProps(e,t)}}const Jp=["ctrl","shift","alt","meta"],Yp={stop:e=>e.stopPropagation(),prevent:e=>e.preventDefault(),self:e=>e.target!==e.currentTarget,ctrl:e=>!e.ctrlKey,shift:e=>!e.shiftKey,alt:e=>!e.altKey,meta:e=>!e.metaKey,left:e=>"button"in e&&e.button!==0,middle:e=>"button"in e&&e.button!==1,right:e=>"button"in e&&e.button!==2,exact:(e,t)=>Jp.some(u=>e[`${u}Key`]&&!t.includes(u))},Xp=(e,t)=>{const u=e._withMods||(e._withMods={}),n=t.join(".");return u[n]||(u[n]=(r,...i)=>{for(let s=0;s<t.length;s++){const o=Yp[t[s]];if(o&&o(r,t))return}return e(r,...i)})},Qp={esc:"escape",space:" ",up:"arrow-up",left:"arrow-left",right:"arrow-right",down:"arrow-down",delete:"backspace"},e2=(e,t)=>{const u=e._withKeys||(e._withKeys={}),n=t.join(".");return u[n]||(u[n]=r=>{if(!("key"in r))return;const i=qe(r.key);if(t.some(s=>s===i||Qp[s]===i))return e(r)})},_0=te({patchProp:Mp},gp);let An,Nc=!1;function y0(){return An||(An=Fa(_0))}function x0(){return An=Nc?An:Ia(_0),Nc=!0,An}const E0=(...e)=>{y0().render(...e)},t2=(...e)=>{x0().hydrate(...e)},Ur=(...e)=>{const t=y0().createApp(...e),{mount:u}=t;return 
t.mount=n=>{const r=A0(n);if(!r)return;const i=t._component;!W(i)&&!i.render&&!i.template&&(i.template=r.innerHTML),r.nodeType===1&&(r.textContent="");const s=u(r,!1,k0(r));return r instanceof Element&&(r.removeAttribute("v-cloak"),r.setAttribute("data-v-app","")),s},t},C0=(...e)=>{const t=x0().createApp(...e),{mount:u}=t;return t.mount=n=>{const r=A0(n);if(r)return u(r,!0,k0(r))},t};function k0(e){if(e instanceof SVGElement)return"svg";if(typeof MathMLElement=="function"&&e instanceof MathMLElement)return"mathml"}function A0(e){return Z(e)?document.querySelector(e):e}let Rc=!1;const u2=()=>{Rc||(Rc=!0,Zp(),kp())},n2=Object.freeze(Object.defineProperty({__proto__:null,BaseTransition:ra,BaseTransitionPropsValidators:Ys,Comment:ye,DeprecationTypes:pp,EffectScope:Vs,ErrorCodes:lh,ErrorTypeStrings:op,Fragment:Ee,KeepAlive:qh,ReactiveEffect:vn,Static:ru,Suspense:V1,Teleport:kh,Text:Pt,TrackOpTypes:Xd,Transition:yp,TransitionGroup:qp,TriggerOpTypes:Qd,VueElement:Ei,assertNumber:ch,callWithAsyncErrorHandling:lt,callWithErrorHandling:wu,camelize:de,capitalize:Tu,cloneVNode:vt,compatUtils:hp,computed:lo,createApp:Ur,createBlock:Rr,createCommentVNode:Z1,createElementBlock:z1,createElementVNode:so,createHydrationRenderer:Ia,createPropsRestProxy:f1,createRenderer:Fa,createSSRApp:C0,createSlots:Jh,createStaticVNode:G1,createTextVNode:oo,createVNode:fe,customRef:jl,defineAsyncComponent:Hh,defineComponent:Kn,defineCustomElement:a0,defineEmits:t1,defineExpose:u1,defineModel:i1,defineOptions:n1,defineProps:e1,defineSSRCustomElement:Lp,defineSlots:r1,devtools:cp,effect:xd,effectScope:gd,getCurrentInstance:ft,getCurrentScope:Sl,getCurrentWatcher:eh,getTransitionRawChildren:pi,guardReactiveProps:Ka,h:ao,handleError:Fu,hasInjectionContext:x1,hydrate:t2,hydrateOnIdle:Mh,hydrateOnInteraction:Vh,hydrateOnMediaQuery:Bh,hydrateOnVisible:Lh,initCustomFormatter:rp,initDirectivesForSSR:u2,inject:Cn,isMemoSame:t0,isProxy:li,isReactive:uu,isReadonly:ou,isRef:Ce,isRuntimeOnly:Q1,isShallow:ut,isVNo
de:Vt,markRaw:Ul,mergeDefaults:l1,mergeModels:a1,mergeProps:Wa,nextTick:fi,normalizeClass:nn,normalizeProps:td,normalizeStyle:un,onActivated:ca,onBeforeMount:fa,onBeforeUnmount:gi,onBeforeUpdate:Qs,onDeactivated:la,onErrorCaptured:ma,onMounted:Gn,onRenderTracked:pa,onRenderTriggered:ha,onScopeDispose:_d,onServerPrefetch:da,onUnmounted:_i,onUpdated:bi,onWatcherCleanup:Kl,openBlock:Mn,popScopeId:yh,provide:xa,proxyRefs:zs,pushScopeId:_h,queuePostFlushCb:wn,reactive:oi,readonly:qs,ref:yn,registerRuntimeCompiler:Ya,render:E0,renderList:Zh,renderSlot:Yh,resolveComponent:Kh,resolveDirective:Gh,resolveDynamicComponent:Wh,resolveFilter:dp,resolveTransitionHooks:zu,setBlockTracking:ps,setDevtoolsHook:lp,setTransitionHooks:Bt,shallowReactive:Hl,shallowReadonly:$d,shallowRef:js,ssrContextKey:Ma,ssrUtils:fp,stop:Ed,toDisplayString:kl,toHandlerKey:Bu,toHandlers:Xh,toRaw:Q,toRef:Zd,toRefs:Kd,toValue:qd,transformVNodeArgs:K1,triggerRef:Ud,unref:ai,useAttrs:c1,useCssModule:$p,useCssVars:Ap,useHost:f0,useId:Sh,useModel:O1,useSSRContext:Pa,useShadowRoot:Vp,useSlots:o1,useTemplateRef:vh,useTransitionState:Js,vModelCheckbox:fo,vModelDynamic:b0,vModelRadio:ho,vModelSelect:p0,vModelText:Hr,vShow:c0,version:ys,warn:sp,watch:Hu,watchEffect:w1,watchPostEffect:F1,watchSyncEffect:La,withAsyncContext:d1,withCtx:Zs,withDefaults:s1,withDirectives:Eh,withKeys:e2,withMemo:ip,withModifiers:Xp,withScopeId:xh},Symbol.toStringTag,{value:"Module"}));/**
-* @vue/compiler-core v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/const Pn=Symbol(""),Sn=Symbol(""),po=Symbol(""),qr=Symbol(""),S0=Symbol(""),Su=Symbol(""),v0=Symbol(""),D0=Symbol(""),mo=Symbol(""),bo=Symbol(""),Yn=Symbol(""),go=Symbol(""),T0=Symbol(""),_o=Symbol(""),yo=Symbol(""),xo=Symbol(""),Eo=Symbol(""),Co=Symbol(""),ko=Symbol(""),w0=Symbol(""),F0=Symbol(""),Ci=Symbol(""),jr=Symbol(""),Ao=Symbol(""),So=Symbol(""),Ln=Symbol(""),Xn=Symbol(""),vo=Symbol(""),ks=Symbol(""),r2=Symbol(""),As=Symbol(""),zr=Symbol(""),i2=Symbol(""),s2=Symbol(""),Do=Symbol(""),o2=Symbol(""),c2=Symbol(""),To=Symbol(""),I0=Symbol(""),Zu={[Pn]:"Fragment",[Sn]:"Teleport",[po]:"Suspense",[qr]:"KeepAlive",[S0]:"BaseTransition",[Su]:"openBlock",[v0]:"createBlock",[D0]:"createElementBlock",[mo]:"createVNode",[bo]:"createElementVNode",[Yn]:"createCommentVNode",[go]:"createTextVNode",[T0]:"createStaticVNode",[_o]:"resolveComponent",[yo]:"resolveDynamicComponent",[xo]:"resolveDirective",[Eo]:"resolveFilter",[Co]:"withDirectives",[ko]:"renderList",[w0]:"renderSlot",[F0]:"createSlots",[Ci]:"toDisplayString",[jr]:"mergeProps",[Ao]:"normalizeClass",[So]:"normalizeStyle",[Ln]:"normalizeProps",[Xn]:"guardReactiveProps",[vo]:"toHandlers",[ks]:"camelize",[r2]:"capitalize",[As]:"toHandlerKey",[zr]:"setBlockTracking",[i2]:"pushScopeId",[s2]:"popScopeId",[Do]:"withCtx",[o2]:"unref",[c2]:"isRef",[To]:"withMemo",[I0]:"isMemoSame"};function l2(e){Object.getOwnPropertySymbols(e).forEach(t=>{Zu[t]=e[t]})}const rt={start:{line:1,column:1,offset:0},end:{line:1,column:1,offset:0},source:""};function a2(e,t=""){return{type:0,source:t,children:e,helpers:new Set,components:[],directives:[],hoists:[],imports:[],cached:[],temps:0,codegenNode:void 0,loc:rt}}function Bn(e,t,u,n,r,i,s,o=!1,c=!1,l=!1,a=rt){return e&&(o?(e.helper(Su),e.helper(Xu(e.inSSR,l))):e.helper(Yu(e.inSSR,l)),s&&e.helper(Co)),{type:13,tag:t,props:u,children:n,patchFlag:r,dynamicProps:i,directives:s,isBlock:o,disableTracking:c,isComponent:l,loc:a}}function Eu(e,t=rt){return{type:17,loc:t,elements:e}}function 
ot(e,t=rt){return{type:15,loc:t,properties:e}}function xe(e,t){return{type:16,loc:rt,key:Z(e)?X(e,!0):e,value:t}}function X(e,t=!1,u=rt,n=0){return{type:4,loc:u,content:e,isStatic:t,constType:t?3:n}}function gt(e,t=rt){return{type:8,loc:t,children:e}}function Ae(e,t=[],u=rt){return{type:14,loc:u,callee:e,arguments:t}}function Ju(e,t=void 0,u=!1,n=!1,r=rt){return{type:18,params:e,returns:t,newline:u,isSlot:n,loc:r}}function Ss(e,t,u,n=!0){return{type:19,test:e,consequent:t,alternate:u,newline:n,loc:rt}}function f2(e,t,u=!1,n=!1){return{type:20,index:e,value:t,needPauseTracking:u,inVOnce:n,needArraySpread:!1,loc:rt}}function d2(e){return{type:21,body:e,loc:rt}}function Yu(e,t){return e||t?mo:bo}function Xu(e,t){return e||t?v0:D0}function wo(e,{helper:t,removeHelper:u,inSSR:n}){e.isBlock||(e.isBlock=!0,u(Yu(n,e.isComponent)),t(Su),t(Xu(n,e.isComponent)))}const Mc=new Uint8Array([123,123]),Pc=new Uint8Array([125,125]);function Lc(e){return e>=97&&e<=122||e>=65&&e<=90}function et(e){return e===32||e===10||e===9||e===12||e===13}function jt(e){return e===47||e===62||et(e)}function Kr(e){const t=new Uint8Array(e.length);for(let u=0;u<e.length;u++)t[u]=e.charCodeAt(u);return t}const Oe={Cdata:new Uint8Array([67,68,65,84,65,91]),CdataEnd:new Uint8Array([93,93,62]),CommentEnd:new Uint8Array([45,45,62]),ScriptEnd:new Uint8Array([60,47,115,99,114,105,112,116]),StyleEnd:new Uint8Array([60,47,115,116,121,108,101]),TitleEnd:new Uint8Array([60,47,116,105,116,108,101]),TextareaEnd:new Uint8Array([60,47,116,101,120,116,97,114,101,97])};class h2{constructor(t,u){this.stack=t,this.cbs=u,this.state=1,this.buffer="",this.sectionStart=0,this.index=0,this.entityStart=0,this.baseState=1,this.inRCDATA=!1,this.inXML=!1,this.inVPre=!1,this.newlines=[],this.mode=0,this.delimiterOpen=Mc,this.delimiterClose=Pc,this.delimiterIndex=-1,this.currentSequence=void 0,this.sequenceIndex=0}get inSFCRoot(){return 
this.mode===2&&this.stack.length===0}reset(){this.state=1,this.mode=0,this.buffer="",this.sectionStart=0,this.index=0,this.baseState=1,this.inRCDATA=!1,this.currentSequence=void 0,this.newlines.length=0,this.delimiterOpen=Mc,this.delimiterClose=Pc}getPos(t){let u=1,n=t+1;for(let r=this.newlines.length-1;r>=0;r--){const i=this.newlines[r];if(t>i){u=r+2,n=t-i;break}}return{column:n,line:u,offset:t}}peek(){return this.buffer.charCodeAt(this.index+1)}stateText(t){t===60?(this.index>this.sectionStart&&this.cbs.ontext(this.sectionStart,this.index),this.state=5,this.sectionStart=this.index):!this.inVPre&&t===this.delimiterOpen[0]&&(this.state=2,this.delimiterIndex=0,this.stateInterpolationOpen(t))}stateInterpolationOpen(t){if(t===this.delimiterOpen[this.delimiterIndex])if(this.delimiterIndex===this.delimiterOpen.length-1){const u=this.index+1-this.delimiterOpen.length;u>this.sectionStart&&this.cbs.ontext(this.sectionStart,u),this.state=3,this.sectionStart=u}else this.delimiterIndex++;else this.inRCDATA?(this.state=32,this.stateInRCDATA(t)):(this.state=1,this.stateText(t))}stateInterpolation(t){t===this.delimiterClose[0]&&(this.state=4,this.delimiterIndex=0,this.stateInterpolationClose(t))}stateInterpolationClose(t){t===this.delimiterClose[this.delimiterIndex]?this.delimiterIndex===this.delimiterClose.length-1?(this.cbs.oninterpolation(this.sectionStart,this.index+1),this.inRCDATA?this.state=32:this.state=1,this.sectionStart=this.index+1):this.delimiterIndex++:(this.state=3,this.stateInterpolation(t))}stateSpecialStartSequence(t){const u=this.sequenceIndex===this.currentSequence.length;if(!(u?jt(t):(t|32)===this.currentSequence[this.sequenceIndex]))this.inRCDATA=!1;else if(!u){this.sequenceIndex++;return}this.sequenceIndex=0,this.state=6,this.stateInTagName(t)}stateInRCDATA(t){if(this.sequenceIndex===this.currentSequence.length){if(t===62||et(t)){const u=this.index-this.currentSequence.length;if(this.sectionStart<u){const 
n=this.index;this.index=u,this.cbs.ontext(this.sectionStart,u),this.index=n}this.sectionStart=u+2,this.stateInClosingTagName(t),this.inRCDATA=!1;return}this.sequenceIndex=0}(t|32)===this.currentSequence[this.sequenceIndex]?this.sequenceIndex+=1:this.sequenceIndex===0?this.currentSequence===Oe.TitleEnd||this.currentSequence===Oe.TextareaEnd&&!this.inSFCRoot?!this.inVPre&&t===this.delimiterOpen[0]&&(this.state=2,this.delimiterIndex=0,this.stateInterpolationOpen(t)):this.fastForwardTo(60)&&(this.sequenceIndex=1):this.sequenceIndex=+(t===60)}stateCDATASequence(t){t===Oe.Cdata[this.sequenceIndex]?++this.sequenceIndex===Oe.Cdata.length&&(this.state=28,this.currentSequence=Oe.CdataEnd,this.sequenceIndex=0,this.sectionStart=this.index+1):(this.sequenceIndex=0,this.state=23,this.stateInDeclaration(t))}fastForwardTo(t){for(;++this.index<this.buffer.length;){const u=this.buffer.charCodeAt(this.index);if(u===10&&this.newlines.push(this.index),u===t)return!0}return this.index=this.buffer.length-1,!1}stateInCommentLike(t){t===this.currentSequence[this.sequenceIndex]?++this.sequenceIndex===this.currentSequence.length&&(this.currentSequence===Oe.CdataEnd?this.cbs.oncdata(this.sectionStart,this.index-2):this.cbs.oncomment(this.sectionStart,this.index-2),this.sequenceIndex=0,this.sectionStart=this.index+1,this.state=1):this.sequenceIndex===0?this.fastForwardTo(this.currentSequence[0])&&(this.sequenceIndex=1):t!==this.currentSequence[this.sequenceIndex-1]&&(this.sequenceIndex=0)}startSpecial(t,u){this.enterRCDATA(t,u),this.state=31}enterRCDATA(t,u){this.inRCDATA=!0,this.currentSequence=t,this.sequenceIndex=u}stateBeforeTagName(t){t===33?(this.state=22,this.sectionStart=this.index+1):t===63?(this.state=24,this.sectionStart=this.index+1):Lc(t)?(this.sectionStart=this.index,this.mode===0?this.state=6:this.inSFCRoot?this.state=34:this.inXML?this.state=6:t===116?this.state=30:this.state=t===115?29:6):t===47?this.state=8:(this.state=1,this.stateText(t))}stateInTagName(t){jt(t)&&this.handleT
agName(t)}stateInSFCRootTagName(t){if(jt(t)){const u=this.buffer.slice(this.sectionStart,this.index);u!=="template"&&this.enterRCDATA(Kr("</"+u),0),this.handleTagName(t)}}handleTagName(t){this.cbs.onopentagname(this.sectionStart,this.index),this.sectionStart=-1,this.state=11,this.stateBeforeAttrName(t)}stateBeforeClosingTagName(t){et(t)||(t===62?(this.state=1,this.sectionStart=this.index+1):(this.state=Lc(t)?9:27,this.sectionStart=this.index))}stateInClosingTagName(t){(t===62||et(t))&&(this.cbs.onclosetag(this.sectionStart,this.index),this.sectionStart=-1,this.state=10,this.stateAfterClosingTagName(t))}stateAfterClosingTagName(t){t===62&&(this.state=1,this.sectionStart=this.index+1)}stateBeforeAttrName(t){t===62?(this.cbs.onopentagend(this.index),this.inRCDATA?this.state=32:this.state=1,this.sectionStart=this.index+1):t===47?this.state=7:t===60&&this.peek()===47?(this.cbs.onopentagend(this.index),this.state=5,this.sectionStart=this.index):et(t)||this.handleAttrStart(t)}handleAttrStart(t){t===118&&this.peek()===45?(this.state=13,this.sectionStart=this.index):t===46||t===58||t===64||t===35?(this.cbs.ondirname(this.index,this.index+1),this.state=14,this.sectionStart=this.index+1):(this.state=12,this.sectionStart=this.index)}stateInSelfClosingTag(t){t===62?(this.cbs.onselfclosingtag(this.index),this.state=1,this.sectionStart=this.index+1,this.inRCDATA=!1):et(t)||(this.state=11,this.stateBeforeAttrName(t))}stateInAttrName(t){(t===61||jt(t))&&(this.cbs.onattribname(this.sectionStart,this.index),this.handleAttrNameEnd(t))}stateInDirName(t){t===61||jt(t)?(this.cbs.ondirname(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===58?(this.cbs.ondirname(this.sectionStart,this.index),this.state=14,this.sectionStart=this.index+1):t===46&&(this.cbs.ondirname(this.sectionStart,this.index),this.state=16,this.sectionStart=this.index+1)}stateInDirArg(t){t===61||jt(t)?(this.cbs.ondirarg(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===91?this.state=15:t===46&&(this.c
bs.ondirarg(this.sectionStart,this.index),this.state=16,this.sectionStart=this.index+1)}stateInDynamicDirArg(t){t===93?this.state=14:(t===61||jt(t))&&(this.cbs.ondirarg(this.sectionStart,this.index+1),this.handleAttrNameEnd(t))}stateInDirModifier(t){t===61||jt(t)?(this.cbs.ondirmodifier(this.sectionStart,this.index),this.handleAttrNameEnd(t)):t===46&&(this.cbs.ondirmodifier(this.sectionStart,this.index),this.sectionStart=this.index+1)}handleAttrNameEnd(t){this.sectionStart=this.index,this.state=17,this.cbs.onattribnameend(this.index),this.stateAfterAttrName(t)}stateAfterAttrName(t){t===61?this.state=18:t===47||t===62?(this.cbs.onattribend(0,this.sectionStart),this.sectionStart=-1,this.state=11,this.stateBeforeAttrName(t)):et(t)||(this.cbs.onattribend(0,this.sectionStart),this.handleAttrStart(t))}stateBeforeAttrValue(t){t===34?(this.state=19,this.sectionStart=this.index+1):t===39?(this.state=20,this.sectionStart=this.index+1):et(t)||(this.sectionStart=this.index,this.state=21,this.stateInAttrValueNoQuotes(t))}handleInAttrValue(t,u){(t===u||this.fastForwardTo(u))&&(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=-1,this.cbs.onattribend(u===34?3:2,this.index+1),this.state=11)}stateInAttrValueDoubleQuotes(t){this.handleInAttrValue(t,34)}stateInAttrValueSingleQuotes(t){this.handleInAttrValue(t,39)}stateInAttrValueNoQuotes(t){et(t)||t===62?(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=-1,this.cbs.onattribend(1,this.index),this.state=11,this.stateBeforeAttrName(t)):(t===39||t===60||t===61||t===96)&&this.cbs.onerr(18,this.index)}stateBeforeDeclaration(t){t===91?(this.state=26,this.sequenceIndex=0):this.state=t===45?25:23}stateInDeclaration(t){(t===62||this.fastForwardTo(62))&&(this.state=1,this.sectionStart=this.index+1)}stateInProcessingInstruction(t){(t===62||this.fastForwardTo(62))&&(this.cbs.onprocessinginstruction(this.sectionStart,this.index),this.state=1,this.sectionStart=this.index+1)}stateBeforeComment(t){t===45?(this.
state=28,this.currentSequence=Oe.CommentEnd,this.sequenceIndex=2,this.sectionStart=this.index+1):this.state=23}stateInSpecialComment(t){(t===62||this.fastForwardTo(62))&&(this.cbs.oncomment(this.sectionStart,this.index),this.state=1,this.sectionStart=this.index+1)}stateBeforeSpecialS(t){t===Oe.ScriptEnd[3]?this.startSpecial(Oe.ScriptEnd,4):t===Oe.StyleEnd[3]?this.startSpecial(Oe.StyleEnd,4):(this.state=6,this.stateInTagName(t))}stateBeforeSpecialT(t){t===Oe.TitleEnd[3]?this.startSpecial(Oe.TitleEnd,4):t===Oe.TextareaEnd[3]?this.startSpecial(Oe.TextareaEnd,4):(this.state=6,this.stateInTagName(t))}startEntity(){}stateInEntity(){}parse(t){for(this.buffer=t;this.index<this.buffer.length;){const u=this.buffer.charCodeAt(this.index);switch(u===10&&this.newlines.push(this.index),this.state){case 1:{this.stateText(u);break}case 2:{this.stateInterpolationOpen(u);break}case 3:{this.stateInterpolation(u);break}case 4:{this.stateInterpolationClose(u);break}case 31:{this.stateSpecialStartSequence(u);break}case 32:{this.stateInRCDATA(u);break}case 26:{this.stateCDATASequence(u);break}case 19:{this.stateInAttrValueDoubleQuotes(u);break}case 12:{this.stateInAttrName(u);break}case 13:{this.stateInDirName(u);break}case 14:{this.stateInDirArg(u);break}case 15:{this.stateInDynamicDirArg(u);break}case 16:{this.stateInDirModifier(u);break}case 28:{this.stateInCommentLike(u);break}case 27:{this.stateInSpecialComment(u);break}case 11:{this.stateBeforeAttrName(u);break}case 6:{this.stateInTagName(u);break}case 34:{this.stateInSFCRootTagName(u);break}case 9:{this.stateInClosingTagName(u);break}case 5:{this.stateBeforeTagName(u);break}case 17:{this.stateAfterAttrName(u);break}case 20:{this.stateInAttrValueSingleQuotes(u);break}case 18:{this.stateBeforeAttrValue(u);break}case 8:{this.stateBeforeClosingTagName(u);break}case 10:{this.stateAfterClosingTagName(u);break}case 29:{this.stateBeforeSpecialS(u);break}case 30:{this.stateBeforeSpecialT(u);break}case 
21:{this.stateInAttrValueNoQuotes(u);break}case 7:{this.stateInSelfClosingTag(u);break}case 23:{this.stateInDeclaration(u);break}case 22:{this.stateBeforeDeclaration(u);break}case 25:{this.stateBeforeComment(u);break}case 24:{this.stateInProcessingInstruction(u);break}case 33:{this.stateInEntity();break}}this.index++}this.cleanup(),this.finish()}cleanup(){this.sectionStart!==this.index&&(this.state===1||this.state===32&&this.sequenceIndex===0?(this.cbs.ontext(this.sectionStart,this.index),this.sectionStart=this.index):(this.state===19||this.state===20||this.state===21)&&(this.cbs.onattribdata(this.sectionStart,this.index),this.sectionStart=this.index))}finish(){this.handleTrailingData(),this.cbs.onend()}handleTrailingData(){const t=this.buffer.length;this.sectionStart>=t||(this.state===28?this.currentSequence===Oe.CdataEnd?this.cbs.oncdata(this.sectionStart,t):this.cbs.oncomment(this.sectionStart,t):this.state===6||this.state===11||this.state===18||this.state===17||this.state===12||this.state===13||this.state===14||this.state===15||this.state===16||this.state===20||this.state===19||this.state===21||this.state===9||this.cbs.ontext(this.sectionStart,t))}emitCodePoint(t,u){}}function Bc(e,{compatConfig:t}){const u=t&&t[e];return e==="MODE"?u||3:u}function Cu(e,t){const u=Bc("MODE",t),n=Bc(e,t);return u===3?n===!0:n!==!1}function Vn(e,t,u,...n){return Cu(e,t)}function Fo(e){throw e}function O0(e){}function me(e,t,u,n){const r=`https://vuejs.org/error-reference/#compiler-${e}`,i=new SyntaxError(String(r));return i.code=e,i.loc=t,i}const Ge=e=>e.type===4&&e.isStatic;function N0(e){switch(e){case"Teleport":case"teleport":return Sn;case"Suspense":case"suspense":return po;case"KeepAlive":case"keep-alive":return qr;case"BaseTransition":case"base-transition":return S0}}const p2=/^\d|[^\$\w\xA0-\uFFFF]/,Io=e=>!p2.test(e),m2=/[A-Za-z_$\xA0-\uFFFF]/,b2=/[\.\?\w$\xA0-\uFFFF]/,g2=/\s+[.[]\s*|\s*[.[]\s+/g,R0=e=>e.type===4?e.content:e.loc.source,_2=e=>{const 
t=R0(e).trim().replace(g2,o=>o.trim());let u=0,n=[],r=0,i=0,s=null;for(let o=0;o<t.length;o++){const c=t.charAt(o);switch(u){case 0:if(c==="[")n.push(u),u=1,r++;else if(c==="(")n.push(u),u=2,i++;else if(!(o===0?m2:b2).test(c))return!1;break;case 1:c==="'"||c==='"'||c==="`"?(n.push(u),u=3,s=c):c==="["?r++:c==="]"&&(--r||(u=n.pop()));break;case 2:if(c==="'"||c==='"'||c==="`")n.push(u),u=3,s=c;else if(c==="(")i++;else if(c===")"){if(o===t.length-1)return!1;--i||(u=n.pop())}break;case 3:c===s&&(u=n.pop(),s=null);break}}return!r&&!i},M0=_2,y2=/^\s*(async\s*)?(\([^)]*?\)|[\w$_]+)\s*(:[^=]+)?=>|^\s*(async\s+)?function(?:\s+[\w$]+)?\s*\(/,x2=e=>y2.test(R0(e)),E2=x2;function st(e,t,u=!1){for(let n=0;n<e.props.length;n++){const r=e.props[n];if(r.type===7&&(u||r.exp)&&(Z(t)?r.name===t:t.test(r.name)))return r}}function ki(e,t,u=!1,n=!1){for(let r=0;r<e.props.length;r++){const i=e.props[r];if(i.type===6){if(u)continue;if(i.name===t&&(i.value||n))return i}else if(i.name==="bind"&&(i.exp||n)&&mu(i.arg,t))return i}}function mu(e,t){return!!(e&&Ge(e)&&e.content===t)}function C2(e){return e.props.some(t=>t.type===7&&t.name==="bind"&&(!t.arg||t.arg.type!==4||!t.arg.isStatic))}function Ki(e){return e.type===5||e.type===2}function k2(e){return e.type===7&&e.name==="slot"}function Wr(e){return e.type===1&&e.tagType===3}function Gr(e){return e.type===1&&e.tagType===2}const A2=new Set([Ln,Xn]);function P0(e,t=[]){if(e&&!Z(e)&&e.type===14){const u=e.callee;if(!Z(u)&&A2.has(u))return P0(e.arguments[0],t.concat(e))}return[e,t]}function Zr(e,t,u){let n,r=e.type===13?e.props:e.arguments[2],i=[],s;if(r&&!Z(r)&&r.type===14){const o=P0(r);r=o[0],i=o[1],s=i[i.length-1]}if(r==null||Z(r))n=ot([t]);else if(r.type===14){const o=r.arguments[0];!Z(o)&&o.type===15?Vc(t,o)||o.properties.unshift(t):r.callee===vo?n=Ae(u.helper(jr),[ot([t]),r]):r.arguments.unshift(ot([t])),!n&&(n=r)}else 
r.type===15?(Vc(t,r)||r.properties.unshift(t),n=r):(n=Ae(u.helper(jr),[ot([t]),r]),s&&s.callee===Xn&&(s=i[i.length-2]));e.type===13?s?s.arguments[0]=n:e.props=n:s?s.arguments[0]=n:e.arguments[2]=n}function Vc(e,t){let u=!1;if(e.key.type===4){const n=e.key.content;u=t.properties.some(r=>r.key.type===4&&r.key.content===n)}return u}function $n(e,t){return`_${t}_${e.replace(/[^\w]/g,(u,n)=>u==="-"?"_":e.charCodeAt(n).toString())}`}function S2(e){return e.type===14&&e.callee===To?e.arguments[1].returns:e}const v2=/([\s\S]*?)\s+(?:in|of)\s+(\S[\s\S]*)/,L0={parseMode:"base",ns:0,delimiters:["{{","}}"],getNamespace:()=>0,isVoidTag:dn,isPreTag:dn,isIgnoreNewlineTag:dn,isCustomElement:dn,onError:Fo,onWarn:O0,comments:!1,prefixIdentifiers:!1};let ie=L0,Hn=null,Lt="",Ne=null,ne=null,ze="",It=-1,hu=-1,Oo=0,Yt=!1,vs=null;const pe=[],ge=new h2(pe,{onerr:Ft,ontext(e,t){dr(Fe(e,t),e,t)},ontextentity(e,t,u){dr(e,t,u)},oninterpolation(e,t){if(Yt)return dr(Fe(e,t),e,t);let u=e+ge.delimiterOpen.length,n=t-ge.delimiterClose.length;for(;et(Lt.charCodeAt(u));)u++;for(;et(Lt.charCodeAt(n-1));)n--;let r=Fe(u,n);r.includes("&")&&(r=ie.decodeEntities(r,!1)),Ds({type:5,content:Cr(r,!1,_e(u,n)),loc:_e(e,t)})},onopentagname(e,t){const u=Fe(e,t);Ne={type:1,tag:u,ns:ie.getNamespace(u,pe[0],ie.ns),tagType:0,props:[],children:[],loc:_e(e-1,t),codegenNode:void 0}},onopentagend(e){Hc(e)},onclosetag(e,t){const u=Fe(e,t);if(!ie.isVoidTag(u)){let n=!1;for(let r=0;r<pe.length;r++)if(pe[r].tag.toLowerCase()===u.toLowerCase()){n=!0,r>0&&Ft(24,pe[0].loc.start.offset);for(let s=0;s<=r;s++){const o=pe.shift();Er(o,t,s<r)}break}n||Ft(23,B0(e,60))}},onselfclosingtag(e){const t=Ne.tag;Ne.isSelfClosing=!0,Hc(e),pe[0]&&pe[0].tag===t&&Er(pe.shift(),e)},onattribname(e,t){ne={type:6,name:Fe(e,t),nameLoc:_e(e,t),value:void 0,loc:_e(e)}},ondirname(e,t){const u=Fe(e,t),n=u==="."||u===":"?"bind":u==="@"?"on":u==="#"?"slot":u.slice(2);if(!Yt&&n===""&&Ft(26,e),Yt||n==="")ne={type:6,name:u,nameLoc:_e(e,t),value:void 
0,loc:_e(e)};else if(ne={type:7,name:n,rawName:u,exp:void 0,arg:void 0,modifiers:u==="."?[X("prop")]:[],loc:_e(e)},n==="pre"){Yt=ge.inVPre=!0,vs=Ne;const r=Ne.props;for(let i=0;i<r.length;i++)r[i].type===7&&(r[i]=L2(r[i]))}},ondirarg(e,t){if(e===t)return;const u=Fe(e,t);if(Yt)ne.name+=u,bu(ne.nameLoc,t);else{const n=u[0]!=="[";ne.arg=Cr(n?u:u.slice(1,-1),n,_e(e,t),n?3:0)}},ondirmodifier(e,t){const u=Fe(e,t);if(Yt)ne.name+="."+u,bu(ne.nameLoc,t);else if(ne.name==="slot"){const n=ne.arg;n&&(n.content+="."+u,bu(n.loc,t))}else{const n=X(u,!0,_e(e,t));ne.modifiers.push(n)}},onattribdata(e,t){ze+=Fe(e,t),It<0&&(It=e),hu=t},onattribentity(e,t,u){ze+=e,It<0&&(It=t),hu=u},onattribnameend(e){const t=ne.loc.start.offset,u=Fe(t,e);ne.type===7&&(ne.rawName=u),Ne.props.some(n=>(n.type===7?n.rawName:n.name)===u)&&Ft(2,t)},onattribend(e,t){if(Ne&&ne){if(bu(ne.loc,t),e!==0)if(ze.includes("&")&&(ze=ie.decodeEntities(ze,!0)),ne.type===6)ne.name==="class"&&(ze=$0(ze).trim()),e===1&&!ze&&Ft(13,t),ne.value={type:2,content:ze,loc:e===1?_e(It,hu):_e(It-1,hu+1)},ge.inSFCRoot&&Ne.tag==="template"&&ne.name==="lang"&&ze&&ze!=="html"&&ge.enterRCDATA(Kr("</template"),0);else{let u=0;ne.exp=Cr(ze,!1,_e(It,hu),0,u),ne.name==="for"&&(ne.forParseResult=T2(ne.exp));let n=-1;ne.name==="bind"&&(n=ne.modifiers.findIndex(r=>r.content==="sync"))>-1&&Vn("COMPILER_V_BIND_SYNC",ie,ne.loc,ne.rawName)&&(ne.name="model",ne.modifiers.splice(n,1))}(ne.type!==7||ne.name!=="pre")&&Ne.props.push(ne)}ze="",It=hu=-1},oncomment(e,t){ie.comments&&Ds({type:3,content:Fe(e,t),loc:_e(e-4,t+3)})},onend(){const e=Lt.length;for(let t=0;t<pe.length;t++)Er(pe[t],e-1),Ft(24,pe[t].loc.start.offset)},oncdata(e,t){pe[0].ns!==0?dr(Fe(e,t),e,t):Ft(1,e-9)},onprocessinginstruction(e){(pe[0]?pe[0].ns:ie.ns)===0&&Ft(21,e-1)}}),$c=/,([^,\}\]]*)(?:,([^,\}\]]*))?$/,D2=/^\(|\)$/g;function T2(e){const t=e.loc,u=e.content,n=u.match(v2);if(!n)return;const[,r,i]=n,s=(f,p,d=!1)=>{const h=t.start.offset+p,_=h+f.length;return 
Cr(f,!1,_e(h,_),0,d?1:0)},o={source:s(i.trim(),u.indexOf(i,r.length)),value:void 0,key:void 0,index:void 0,finalized:!1};let c=r.trim().replace(D2,"").trim();const l=r.indexOf(c),a=c.match($c);if(a){c=c.replace($c,"").trim();const f=a[1].trim();let p;if(f&&(p=u.indexOf(f,l+c.length),o.key=s(f,p,!0)),a[2]){const d=a[2].trim();d&&(o.index=s(d,u.indexOf(d,o.key?p+f.length:l+c.length),!0))}}return c&&(o.value=s(c,l,!0)),o}function Fe(e,t){return Lt.slice(e,t)}function Hc(e){ge.inSFCRoot&&(Ne.innerLoc=_e(e+1,e+1)),Ds(Ne);const{tag:t,ns:u}=Ne;u===0&&ie.isPreTag(t)&&Oo++,ie.isVoidTag(t)?Er(Ne,e):(pe.unshift(Ne),(u===1||u===2)&&(ge.inXML=!0)),Ne=null}function dr(e,t,u){{const i=pe[0]&&pe[0].tag;i!=="script"&&i!=="style"&&e.includes("&")&&(e=ie.decodeEntities(e,!1))}const n=pe[0]||Hn,r=n.children[n.children.length-1];r&&r.type===2?(r.content+=e,bu(r.loc,u)):n.children.push({type:2,content:e,loc:_e(t,u)})}function Er(e,t,u=!1){u?bu(e.loc,B0(t,60)):bu(e.loc,w2(t,62)+1),ge.inSFCRoot&&(e.children.length?e.innerLoc.end=te({},e.children[e.children.length-1].loc.end):e.innerLoc.end=te({},e.innerLoc.start),e.innerLoc.source=Fe(e.innerLoc.start.offset,e.innerLoc.end.offset));const{tag:n,ns:r,children:i}=e;if(Yt||(n==="slot"?e.tagType=2:Uc(e)?e.tagType=3:I2(e)&&(e.tagType=1)),ge.inRCDATA||(e.children=V0(i)),r===0&&ie.isIgnoreNewlineTag(n)){const s=i[0];s&&s.type===2&&(s.content=s.content.replace(/^\r?\n/,""))}r===0&&ie.isPreTag(n)&&Oo--,vs===e&&(Yt=ge.inVPre=!1,vs=null),ge.inXML&&(pe[0]?pe[0].ns:ie.ns)===0&&(ge.inXML=!1);{const s=e.props;if(!ge.inSFCRoot&&Cu("COMPILER_NATIVE_TEMPLATE",ie)&&e.tag==="template"&&!Uc(e)){const c=pe[0]||Hn,l=c.children.indexOf(e);c.children.splice(l,1,...e.children)}const o=s.find(c=>c.type===6&&c.name==="inline-template");o&&Vn("COMPILER_INLINE_TEMPLATE",ie,o.loc)&&e.children.length&&(o.value={type:2,content:Fe(e.children[0].loc.start.offset,e.children[e.children.length-1].loc.end.offset),loc:o.loc})}}function w2(e,t){let 
u=e;for(;Lt.charCodeAt(u)!==t&&u<Lt.length-1;)u++;return u}function B0(e,t){let u=e;for(;Lt.charCodeAt(u)!==t&&u>=0;)u--;return u}const F2=new Set(["if","else","else-if","for","slot"]);function Uc({tag:e,props:t}){if(e==="template"){for(let u=0;u<t.length;u++)if(t[u].type===7&&F2.has(t[u].name))return!0}return!1}function I2({tag:e,props:t}){if(ie.isCustomElement(e))return!1;if(e==="component"||O2(e.charCodeAt(0))||N0(e)||ie.isBuiltInComponent&&ie.isBuiltInComponent(e)||ie.isNativeTag&&!ie.isNativeTag(e))return!0;for(let u=0;u<t.length;u++){const n=t[u];if(n.type===6){if(n.name==="is"&&n.value){if(n.value.content.startsWith("vue:"))return!0;if(Vn("COMPILER_IS_ON_ELEMENT",ie,n.loc))return!0}}else if(n.name==="bind"&&mu(n.arg,"is")&&Vn("COMPILER_IS_ON_ELEMENT",ie,n.loc))return!0}return!1}function O2(e){return e>64&&e<91}const N2=/\r\n/g;function V0(e,t){const u=ie.whitespace!=="preserve";let n=!1;for(let r=0;r<e.length;r++){const i=e[r];if(i.type===2)if(Oo)i.content=i.content.replace(N2,`
-`);else if(R2(i.content)){const s=e[r-1]&&e[r-1].type,o=e[r+1]&&e[r+1].type;!s||!o||u&&(s===3&&(o===3||o===1)||s===1&&(o===3||o===1&&M2(i.content)))?(n=!0,e[r]=null):i.content=" "}else u&&(i.content=$0(i.content))}return n?e.filter(Boolean):e}function R2(e){for(let t=0;t<e.length;t++)if(!et(e.charCodeAt(t)))return!1;return!0}function M2(e){for(let t=0;t<e.length;t++){const u=e.charCodeAt(t);if(u===10||u===13)return!0}return!1}function $0(e){let t="",u=!1;for(let n=0;n<e.length;n++)et(e.charCodeAt(n))?u||(t+=" ",u=!0):(t+=e[n],u=!1);return t}function Ds(e){(pe[0]||Hn).children.push(e)}function _e(e,t){return{start:ge.getPos(e),end:t==null?t:ge.getPos(t),source:t==null?t:Fe(e,t)}}function P2(e){return _e(e.start.offset,e.end.offset)}function bu(e,t){e.end=ge.getPos(t),e.source=Fe(e.start.offset,t)}function L2(e){const t={type:6,name:e.rawName,nameLoc:_e(e.loc.start.offset,e.loc.start.offset+e.rawName.length),value:void 0,loc:e.loc};if(e.exp){const u=e.exp.loc;u.end.offset<e.loc.end.offset&&(u.start.offset--,u.start.column--,u.end.offset++,u.end.column++),t.value={type:2,content:e.exp.content,loc:u}}return t}function Cr(e,t=!1,u,n=0,r=0){return X(e,t,u,n)}function Ft(e,t,u){ie.onError(me(e,_e(t,t)))}function B2(){ge.reset(),Ne=null,ne=null,ze="",It=-1,hu=-1,pe.length=0}function V2(e,t){if(B2(),Lt=e,ie=te({},L0),t){let r;for(r in t)t[r]!=null&&(ie[r]=t[r])}ge.mode=ie.parseMode==="html"?1:ie.parseMode==="sfc"?2:0,ge.inXML=ie.ns===1||ie.ns===2;const u=t&&t.delimiters;u&&(ge.delimiterOpen=Kr(u[0]),ge.delimiterClose=Kr(u[1]));const n=Hn=a2([],e);return ge.parse(Lt),n.loc=_e(0,e.length),n.children=V0(n.children),Hn=null,n}function $2(e,t){kr(e,void 0,t,H0(e,e.children[0]))}function H0(e,t){const{children:u}=e;return u.length===1&&t.type===1&&!Gr(t)}function kr(e,t,u,n=!1,r=!1){const{children:i}=e,s=[];for(let a=0;a<i.length;a++){const f=i[a];if(f.type===1&&f.tagType===0){const p=n?0:tt(f,u);if(p>0){if(p>=2){f.codegenNode.patchFlag=-1,s.push(f);continue}}else{const 
d=f.codegenNode;if(d.type===13){const h=d.patchFlag;if((h===void 0||h===512||h===1)&&q0(f,u)>=2){const _=j0(f);_&&(d.props=u.hoist(_))}d.dynamicProps&&(d.dynamicProps=u.hoist(d.dynamicProps))}}}else if(f.type===12&&(n?0:tt(f,u))>=2){s.push(f);continue}if(f.type===1){const p=f.tagType===1;p&&u.scopes.vSlot++,kr(f,e,u,!1,r),p&&u.scopes.vSlot--}else if(f.type===11)kr(f,e,u,f.children.length===1,!0);else if(f.type===9)for(let p=0;p<f.branches.length;p++)kr(f.branches[p],e,u,f.branches[p].children.length===1,r)}let o=!1;if(s.length===i.length&&e.type===1){if(e.tagType===0&&e.codegenNode&&e.codegenNode.type===13&&$(e.codegenNode.children))e.codegenNode.children=c(Eu(e.codegenNode.children)),o=!0;else if(e.tagType===1&&e.codegenNode&&e.codegenNode.type===13&&e.codegenNode.children&&!$(e.codegenNode.children)&&e.codegenNode.children.type===15){const a=l(e.codegenNode,"default");a&&(a.returns=c(Eu(a.returns)),o=!0)}else if(e.tagType===3&&t&&t.type===1&&t.tagType===1&&t.codegenNode&&t.codegenNode.type===13&&t.codegenNode.children&&!$(t.codegenNode.children)&&t.codegenNode.children.type===15){const a=st(e,"slot",!0),f=a&&a.arg&&l(t.codegenNode,a.arg);f&&(f.returns=c(Eu(f.returns)),o=!0)}}if(!o)for(const a of s)a.codegenNode=u.cache(a.codegenNode);function c(a){const f=u.cache(a);return r&&u.hmr&&(f.needArraySpread=!0),f}function l(a,f){if(a.children&&!$(a.children)&&a.children.type===15){const p=a.children.properties.find(d=>d.key===f||d.key.content===f);return p&&p.value}}s.length&&u.transformHoist&&u.transformHoist(i,u,e)}function tt(e,t){const{constantCache:u}=t;switch(e.type){case 1:if(e.tagType!==0)return 0;const n=u.get(e);if(n!==void 0)return n;const r=e.codegenNode;if(r.type!==13||r.isBlock&&e.tag!=="svg"&&e.tag!=="foreignObject"&&e.tag!=="math")return 0;if(r.patchFlag===void 0){let s=3;const o=q0(e,t);if(o===0)return u.set(e,0),0;o<s&&(s=o);for(let c=0;c<e.children.length;c++){const l=tt(e.children[c],t);if(l===0)return u.set(e,0),0;l<s&&(s=l)}if(s>1)for(let 
c=0;c<e.props.length;c++){const l=e.props[c];if(l.type===7&&l.name==="bind"&&l.exp){const a=tt(l.exp,t);if(a===0)return u.set(e,0),0;a<s&&(s=a)}}if(r.isBlock){for(let c=0;c<e.props.length;c++)if(e.props[c].type===7)return u.set(e,0),0;t.removeHelper(Su),t.removeHelper(Xu(t.inSSR,r.isComponent)),r.isBlock=!1,t.helper(Yu(t.inSSR,r.isComponent))}return u.set(e,s),s}else return u.set(e,0),0;case 2:case 3:return 3;case 9:case 11:case 10:return 0;case 5:case 12:return tt(e.content,t);case 4:return e.constType;case 8:let i=3;for(let s=0;s<e.children.length;s++){const o=e.children[s];if(Z(o)||Ze(o))continue;const c=tt(o,t);if(c===0)return 0;c<i&&(i=c)}return i;case 20:return 2;default:return 0}}const H2=new Set([Ao,So,Ln,Xn]);function U0(e,t){if(e.type===14&&!Z(e.callee)&&H2.has(e.callee)){const u=e.arguments[0];if(u.type===4)return tt(u,t);if(u.type===14)return U0(u,t)}return 0}function q0(e,t){let u=3;const n=j0(e);if(n&&n.type===15){const{properties:r}=n;for(let i=0;i<r.length;i++){const{key:s,value:o}=r[i],c=tt(s,t);if(c===0)return c;c<u&&(u=c);let l;if(o.type===4?l=tt(o,t):o.type===14?l=U0(o,t):l=0,l===0)return l;l<u&&(u=l)}}return u}function j0(e){const t=e.codegenNode;if(t.type===13)return t.props}function U2(e,{filename:t="",prefixIdentifiers:u=!1,hoistStatic:n=!1,hmr:r=!1,cacheHandlers:i=!1,nodeTransforms:s=[],directiveTransforms:o={},transformHoist:c=null,isBuiltInComponent:l=we,isCustomElement:a=we,expressionPlugins:f=[],scopeId:p=null,slotted:d=!0,ssr:h=!1,inSSR:_=!1,ssrCssVars:v="",bindingMetadata:A=ee,inline:E=!1,isTS:m=!1,onError:g=Fo,onWarn:x=O0,compatConfig:k}){const I=t.replace(/\?.*$/,"").match(/([^/\\]+)\.\w+$/),w={filename:t,selfName:I&&Tu(de(I[1])),prefixIdentifiers:u,hoistStatic:n,hmr:r,cacheHandlers:i,nodeTransforms:s,directiveTransforms:o,transformHoist:c,isBuiltInComponent:l,isCustomElement:a,expressionPlugins:f,scopeId:p,slotted:d,ssr:h,inSSR:_,ssrCssVars:v,bindingMetadata:A,inline:E,isTS:m,onError:g,onWarn:x,compatConfig:k,root:e,helpers:new 
Map,components:new Set,directives:new Set,hoists:[],imports:[],cached:[],constantCache:new WeakMap,temps:0,identifiers:Object.create(null),scopes:{vFor:0,vSlot:0,vPre:0,vOnce:0},parent:null,grandParent:null,currentNode:e,childIndex:0,inVOnce:!1,helper(C){const S=w.helpers.get(C)||0;return w.helpers.set(C,S+1),C},removeHelper(C){const S=w.helpers.get(C);if(S){const R=S-1;R?w.helpers.set(C,R):w.helpers.delete(C)}},helperString(C){return`_${Zu[w.helper(C)]}`},replaceNode(C){w.parent.children[w.childIndex]=w.currentNode=C},removeNode(C){const S=w.parent.children,R=C?S.indexOf(C):w.currentNode?w.childIndex:-1;!C||C===w.currentNode?(w.currentNode=null,w.onNodeRemoved()):w.childIndex>R&&(w.childIndex--,w.onNodeRemoved()),w.parent.children.splice(R,1)},onNodeRemoved:we,addIdentifiers(C){},removeIdentifiers(C){},hoist(C){Z(C)&&(C=X(C)),w.hoists.push(C);const S=X(`_hoisted_${w.hoists.length}`,!1,C.loc,2);return S.hoisted=C,S},cache(C,S=!1,R=!1){const T=f2(w.cached.length,C,S,R);return w.cached.push(T),T}};return w.filters=new Set,w}function q2(e,t){const u=U2(e,t);Ai(e,u),t.hoistStatic&&$2(e,u),t.ssr||j2(e,u),e.helpers=new Set([...u.helpers.keys()]),e.components=[...u.components],e.directives=[...u.directives],e.imports=u.imports,e.hoists=u.hoists,e.temps=u.temps,e.cached=u.cached,e.transformed=!0,e.filters=[...u.filters]}function j2(e,t){const{helper:u}=t,{children:n}=e;if(n.length===1){const r=n[0];if(H0(e,r)&&r.codegenNode){const i=r.codegenNode;i.type===13&&wo(i,t),e.codegenNode=i}else e.codegenNode=r}else if(n.length>1){let r=64;e.codegenNode=Bn(t,u(Pn),void 0,e.children,r,void 0,void 0,!0,void 0,!1)}}function z2(e,t){let u=0;const n=()=>{u--};for(;u<e.children.length;u++){const r=e.children[u];Z(r)||(t.grandParent=t.parent,t.parent=e,t.childIndex=u,t.onNodeRemoved=n,Ai(r,t))}}function Ai(e,t){t.currentNode=e;const{nodeTransforms:u}=t,n=[];for(let i=0;i<u.length;i++){const s=u[i](e,t);if(s&&($(s)?n.push(...s):n.push(s)),t.currentNode)e=t.currentNode;else 
return}switch(e.type){case 3:t.ssr||t.helper(Yn);break;case 5:t.ssr||t.helper(Ci);break;case 9:for(let i=0;i<e.branches.length;i++)Ai(e.branches[i],t);break;case 10:case 11:case 1:case 0:z2(e,t);break}t.currentNode=e;let r=n.length;for(;r--;)n[r]()}function z0(e,t){const u=Z(e)?n=>n===e:n=>e.test(n);return(n,r)=>{if(n.type===1){const{props:i}=n;if(n.tagType===3&&i.some(k2))return;const s=[];for(let o=0;o<i.length;o++){const c=i[o];if(c.type===7&&u(c.name)){i.splice(o,1),o--;const l=t(n,c,r);l&&s.push(l)}}return s}}}const Si="/*@__PURE__*/",K0=e=>`${Zu[e]}: _${Zu[e]}`;function K2(e,{mode:t="function",prefixIdentifiers:u=t==="module",sourceMap:n=!1,filename:r="template.vue.html",scopeId:i=null,optimizeImports:s=!1,runtimeGlobalName:o="Vue",runtimeModuleName:c="vue",ssrRuntimeModuleName:l="vue/server-renderer",ssr:a=!1,isTS:f=!1,inSSR:p=!1}){const d={mode:t,prefixIdentifiers:u,sourceMap:n,filename:r,scopeId:i,optimizeImports:s,runtimeGlobalName:o,runtimeModuleName:c,ssrRuntimeModuleName:l,ssr:a,isTS:f,inSSR:p,source:e.source,code:"",column:1,line:1,offset:0,indentLevel:0,pure:!1,map:void 0,helper(_){return`_${Zu[_]}`},push(_,v=-2,A){d.code+=_},indent(){h(++d.indentLevel)},deindent(_=!1){_?--d.indentLevel:h(--d.indentLevel)},newline(){h(d.indentLevel)}};function h(_){d.push(`
-`+"  ".repeat(_),0)}return d}function W2(e,t={}){const u=K2(e,t);t.onContextCreated&&t.onContextCreated(u);const{mode:n,push:r,prefixIdentifiers:i,indent:s,deindent:o,newline:c,scopeId:l,ssr:a}=u,f=Array.from(e.helpers),p=f.length>0,d=!i&&n!=="module";G2(e,u);const _=a?"ssrRender":"render",A=(a?["_ctx","_push","_parent","_attrs"]:["_ctx","_cache"]).join(", ");if(r(`function ${_}(${A}) {`),s(),d&&(r("with (_ctx) {"),s(),p&&(r(`const { ${f.map(K0).join(", ")} } = _Vue
-`,-1),c())),e.components.length&&(Wi(e.components,"component",u),(e.directives.length||e.temps>0)&&c()),e.directives.length&&(Wi(e.directives,"directive",u),e.temps>0&&c()),e.filters&&e.filters.length&&(c(),Wi(e.filters,"filter",u),c()),e.temps>0){r("let ");for(let E=0;E<e.temps;E++)r(`${E>0?", ":""}_temp${E}`)}return(e.components.length||e.directives.length||e.temps)&&(r(`
-`,0),c()),a||r("return "),e.codegenNode?Le(e.codegenNode,u):r("null"),d&&(o(),r("}")),o(),r("}"),{ast:e,code:u.code,preamble:"",map:u.map?u.map.toJSON():void 0}}function G2(e,t){const{ssr:u,prefixIdentifiers:n,push:r,newline:i,runtimeModuleName:s,runtimeGlobalName:o,ssrRuntimeModuleName:c}=t,l=o,a=Array.from(e.helpers);if(a.length>0&&(r(`const _Vue = ${l}
-`,-1),e.hoists.length)){const f=[mo,bo,Yn,go,T0].filter(p=>a.includes(p)).map(K0).join(", ");r(`const { ${f} } = _Vue
-`,-1)}Z2(e.hoists,t),i(),r("return ")}function Wi(e,t,{helper:u,push:n,newline:r,isTS:i}){const s=u(t==="filter"?Eo:t==="component"?_o:xo);for(let o=0;o<e.length;o++){let c=e[o];const l=c.endsWith("__self");l&&(c=c.slice(0,-6)),n(`const ${$n(c,t)} = ${s}(${JSON.stringify(c)}${l?", true":""})${i?"!":""}`),o<e.length-1&&r()}}function Z2(e,t){if(!e.length)return;t.pure=!0;const{push:u,newline:n}=t;n();for(let r=0;r<e.length;r++){const i=e[r];i&&(u(`const _hoisted_${r+1} = `),Le(i,t),n())}t.pure=!1}function No(e,t){const u=e.length>3||!1;t.push("["),u&&t.indent(),Qn(e,t,u),u&&t.deindent(),t.push("]")}function Qn(e,t,u=!1,n=!0){const{push:r,newline:i}=t;for(let s=0;s<e.length;s++){const o=e[s];Z(o)?r(o,-3):$(o)?No(o,t):Le(o,t),s<e.length-1&&(u?(n&&r(","),i()):n&&r(", "))}}function Le(e,t){if(Z(e)){t.push(e,-3);return}if(Ze(e)){t.push(t.helper(e));return}switch(e.type){case 1:case 9:case 11:Le(e.codegenNode,t);break;case 2:J2(e,t);break;case 4:W0(e,t);break;case 5:Y2(e,t);break;case 12:Le(e.codegenNode,t);break;case 8:G0(e,t);break;case 3:Q2(e,t);break;case 13:em(e,t);break;case 14:um(e,t);break;case 15:nm(e,t);break;case 17:rm(e,t);break;case 18:im(e,t);break;case 19:sm(e,t);break;case 20:om(e,t);break;case 21:Qn(e.body,t,!0,!1);break}}function J2(e,t){t.push(JSON.stringify(e.content),-3,e)}function W0(e,t){const{content:u,isStatic:n}=e;t.push(n?JSON.stringify(u):u,-3,e)}function Y2(e,t){const{push:u,helper:n,pure:r}=t;r&&u(Si),u(`${n(Ci)}(`),Le(e.content,t),u(")")}function G0(e,t){for(let u=0;u<e.children.length;u++){const n=e.children[u];Z(n)?t.push(n,-3):Le(n,t)}}function X2(e,t){const{push:u}=t;if(e.type===8)u("["),G0(e,t),u("]");else if(e.isStatic){const n=Io(e.content)?e.content:JSON.stringify(e.content);u(n,-2,e)}else u(`[${e.content}]`,-3,e)}function Q2(e,t){const{push:u,helper:n,pure:r}=t;r&&u(Si),u(`${n(Yn)}(${JSON.stringify(e.content)})`,-3,e)}function 
em(e,t){const{push:u,helper:n,pure:r}=t,{tag:i,props:s,children:o,patchFlag:c,dynamicProps:l,directives:a,isBlock:f,disableTracking:p,isComponent:d}=e;let h;c&&(h=String(c)),a&&u(n(Co)+"("),f&&u(`(${n(Su)}(${p?"true":""}), `),r&&u(Si);const _=f?Xu(t.inSSR,d):Yu(t.inSSR,d);u(n(_)+"(",-2,e),Qn(tm([i,s,o,h,l]),t),u(")"),f&&u(")"),a&&(u(", "),Le(a,t),u(")"))}function tm(e){let t=e.length;for(;t--&&e[t]==null;);return e.slice(0,t+1).map(u=>u||"null")}function um(e,t){const{push:u,helper:n,pure:r}=t,i=Z(e.callee)?e.callee:n(e.callee);r&&u(Si),u(i+"(",-2,e),Qn(e.arguments,t),u(")")}function nm(e,t){const{push:u,indent:n,deindent:r,newline:i}=t,{properties:s}=e;if(!s.length){u("{}",-2,e);return}const o=s.length>1||!1;u(o?"{":"{ "),o&&n();for(let c=0;c<s.length;c++){const{key:l,value:a}=s[c];X2(l,t),u(": "),Le(a,t),c<s.length-1&&(u(","),i())}o&&r(),u(o?"}":" }")}function rm(e,t){No(e.elements,t)}function im(e,t){const{push:u,indent:n,deindent:r}=t,{params:i,returns:s,body:o,newline:c,isSlot:l}=e;l&&u(`_${Zu[Do]}(`),u("(",-2,e),$(i)?Qn(i,t):i&&Le(i,t),u(") => "),(c||o)&&(u("{"),n()),s?(c&&u("return "),$(s)?No(s,t):Le(s,t)):o&&Le(o,t),(c||o)&&(r(),u("}")),l&&(e.isNonScopedSlot&&u(", undefined, true"),u(")"))}function sm(e,t){const{test:u,consequent:n,alternate:r,newline:i}=e,{push:s,indent:o,deindent:c,newline:l}=t;if(u.type===4){const f=!Io(u.content);f&&s("("),W0(u,t),f&&s(")")}else s("("),Le(u,t),s(")");i&&o(),t.indentLevel++,i||s(" "),s("? 
"),Le(n,t),t.indentLevel--,i&&l(),i||s(" "),s(": ");const a=r.type===19;a||t.indentLevel++,Le(r,t),a||t.indentLevel--,i&&c(!0)}function om(e,t){const{push:u,helper:n,indent:r,deindent:i,newline:s}=t,{needPauseTracking:o,needArraySpread:c}=e;c&&u("[...("),u(`_cache[${e.index}] || (`),o&&(r(),u(`${n(zr)}(-1`),e.inVOnce&&u(", true"),u("),"),s(),u("(")),u(`_cache[${e.index}] = `),Le(e.value,t),o&&(u(`).cacheIndex = ${e.index},`),s(),u(`${n(zr)}(1),`),s(),u(`_cache[${e.index}]`),i()),u(")"),c&&u(")]")}new RegExp("\\b"+"arguments,await,break,case,catch,class,const,continue,debugger,default,delete,do,else,export,extends,finally,for,function,if,import,let,new,return,super,switch,throw,try,var,void,while,with,yield".split(",").join("\\b|\\b")+"\\b");const cm=z0(/^(if|else|else-if)$/,(e,t,u)=>lm(e,t,u,(n,r,i)=>{const s=u.parent.children;let o=s.indexOf(n),c=0;for(;o-->=0;){const l=s[o];l&&l.type===9&&(c+=l.branches.length)}return()=>{if(i)n.codegenNode=jc(r,c,u);else{const l=am(n.codegenNode);l.alternate=jc(r,c+n.branches.length-1,u)}}}));function lm(e,t,u,n){if(t.name!=="else"&&(!t.exp||!t.exp.content.trim())){const r=t.exp?t.exp.loc:e.loc;u.onError(me(28,t.loc)),t.exp=X("true",!1,r)}if(t.name==="if"){const r=qc(e,t),i={type:9,loc:P2(e.loc),branches:[r]};if(u.replaceNode(i),n)return n(i,r,!0)}else{const r=u.parent.children;let i=r.indexOf(e);for(;i-->=-1;){const s=r[i];if(s&&s.type===3){u.removeNode(s);continue}if(s&&s.type===2&&!s.content.trim().length){u.removeNode(s);continue}if(s&&s.type===9){t.name==="else-if"&&s.branches[s.branches.length-1].condition===void 0&&u.onError(me(30,e.loc)),u.removeNode();const o=qc(e,t);s.branches.push(o);const c=n&&n(s,o,!1);Ai(o,u),c&&c(),u.currentNode=null}else u.onError(me(30,e.loc));break}}}function qc(e,t){const u=e.tagType===3;return{type:10,loc:e.loc,condition:t.name==="else"?void 0:t.exp,children:u&&!st(e,"for")?e.children:[e],userKey:ki(e,"key"),isTemplateIf:u}}function jc(e,t,u){return 
e.condition?Ss(e.condition,zc(e,t,u),Ae(u.helper(Yn),['""',"true"])):zc(e,t,u)}function zc(e,t,u){const{helper:n}=u,r=xe("key",X(`${t}`,!1,rt,2)),{children:i}=e,s=i[0];if(i.length!==1||s.type!==1)if(i.length===1&&s.type===11){const c=s.codegenNode;return Zr(c,r,u),c}else return Bn(u,n(Pn),ot([r]),i,64,void 0,void 0,!0,!1,!1,e.loc);else{const c=s.codegenNode,l=S2(c);return l.type===13&&wo(l,u),Zr(l,r,u),c}}function am(e){for(;;)if(e.type===19)if(e.alternate.type===19)e=e.alternate;else return e;else e.type===20&&(e=e.value)}const fm=(e,t,u)=>{const{modifiers:n,loc:r}=e,i=e.arg;let{exp:s}=e;if(s&&s.type===4&&!s.content.trim()&&(s=void 0),!s){if(i.type!==4||!i.isStatic)return u.onError(me(52,i.loc)),{props:[xe(i,X("",!0,r))]};Z0(e),s=e.exp}return i.type!==4?(i.children.unshift("("),i.children.push(') || ""')):i.isStatic||(i.content=`${i.content} || ""`),n.some(o=>o.content==="camel")&&(i.type===4?i.isStatic?i.content=de(i.content):i.content=`${u.helperString(ks)}(${i.content})`:(i.children.unshift(`${u.helperString(ks)}(`),i.children.push(")"))),u.inSSR||(n.some(o=>o.content==="prop")&&Kc(i,"."),n.some(o=>o.content==="attr")&&Kc(i,"^")),{props:[xe(i,s)]}},Z0=(e,t)=>{const u=e.arg,n=de(u.content);e.exp=X(n,!1,u.loc)},Kc=(e,t)=>{e.type===4?e.isStatic?e.content=t+e.content:e.content=`\`${t}\${${e.content}}\``:(e.children.unshift(`'${t}' + (`),e.children.push(")"))},dm=z0("for",(e,t,u)=>{const{helper:n,removeHelper:r}=u;return hm(e,t,u,i=>{const s=Ae(n(ko),[i.source]),o=Wr(e),c=st(e,"memo"),l=ki(e,"key",!1,!0);l&&l.type===7&&!l.exp&&Z0(l);let f=l&&(l.type===6?l.value?X(l.value.content,!0):void 0:l.exp);const p=l&&f?xe("key",f):null,d=i.source.type===4&&i.source.constType>0,h=d?64:l?128:256;return i.codegenNode=Bn(u,n(Pn),void 0,s,h,void 0,void 0,!0,!d,!1,e.loc),()=>{let _;const{children:v}=i,A=v.length!==1||v[0].type!==1,E=Gr(e)?e:o&&e.children.length===1&&Gr(e.children[0])?e.children[0]:null;if(E?(_=E.codegenNode,o&&p&&Zr(_,p,u)):A?_=Bn(u,n(Pn),p?ot([p]):void 
0,e.children,64,void 0,void 0,!0,void 0,!1):(_=v[0].codegenNode,o&&p&&Zr(_,p,u),_.isBlock!==!d&&(_.isBlock?(r(Su),r(Xu(u.inSSR,_.isComponent))):r(Yu(u.inSSR,_.isComponent))),_.isBlock=!d,_.isBlock?(n(Su),n(Xu(u.inSSR,_.isComponent))):n(Yu(u.inSSR,_.isComponent))),c){const m=Ju(Ts(i.parseResult,[X("_cached")]));m.body=d2([gt(["const _memo = (",c.exp,")"]),gt(["if (_cached",...f?[" && _cached.key === ",f]:[],` && ${u.helperString(I0)}(_cached, _memo)) return _cached`]),gt(["const _item = ",_]),X("_item.memo = _memo"),X("return _item")]),s.arguments.push(m,X("_cache"),X(String(u.cached.length))),u.cached.push(null)}else s.arguments.push(Ju(Ts(i.parseResult),_,!0))}})});function hm(e,t,u,n){if(!t.exp){u.onError(me(31,t.loc));return}const r=t.forParseResult;if(!r){u.onError(me(32,t.loc));return}J0(r);const{addIdentifiers:i,removeIdentifiers:s,scopes:o}=u,{source:c,value:l,key:a,index:f}=r,p={type:11,loc:t.loc,source:c,valueAlias:l,keyAlias:a,objectIndexAlias:f,parseResult:r,children:Wr(e)?e.children:[e]};u.replaceNode(p),o.vFor++;const d=n&&n(p);return()=>{o.vFor--,d&&d()}}function J0(e,t){e.finalized||(e.finalized=!0)}function Ts({value:e,key:t,index:u},n=[]){return pm([e,t,u,...n])}function pm(e){let t=e.length;for(;t--&&!e[t];);return e.slice(0,t+1).map((u,n)=>u||X("_".repeat(n+1),!1))}const Wc=X("undefined",!1),mm=(e,t)=>{if(e.type===1&&(e.tagType===1||e.tagType===3)){const u=st(e,"slot");if(u)return u.exp,t.scopes.vSlot++,()=>{t.scopes.vSlot--}}},bm=(e,t,u,n)=>Ju(e,u,!1,!0,u.length?u[0].loc:n);function gm(e,t,u=bm){t.helper(Do);const{children:n,loc:r}=e,i=[],s=[];let o=t.scopes.vSlot>0||t.scopes.vFor>0;const c=st(e,"slot",!0);if(c){const{arg:v,exp:A}=c;v&&!Ge(v)&&(o=!0),i.push(xe(v||X("default",!0),u(A,void 0,n,r)))}let l=!1,a=!1;const f=[],p=new Set;let d=0;for(let v=0;v<n.length;v++){const A=n[v];let 
E;if(!Wr(A)||!(E=st(A,"slot",!0))){A.type!==3&&f.push(A);continue}if(c){t.onError(me(37,E.loc));break}l=!0;const{children:m,loc:g}=A,{arg:x=X("default",!0),exp:k,loc:I}=E;let w;Ge(x)?w=x?x.content:"default":o=!0;const C=st(A,"for"),S=u(k,C,m,g);let R,T;if(R=st(A,"if"))o=!0,s.push(Ss(R.exp,hr(x,S,d++),Wc));else if(T=st(A,/^else(-if)?$/,!0)){let P=v,q;for(;P--&&(q=n[P],q.type===3););if(q&&Wr(q)&&st(q,/^(else-)?if$/)){let z=s[s.length-1];for(;z.alternate.type===19;)z=z.alternate;z.alternate=T.exp?Ss(T.exp,hr(x,S,d++),Wc):hr(x,S,d++)}else t.onError(me(30,T.loc))}else if(C){o=!0;const P=C.forParseResult;P?(J0(P),s.push(Ae(t.helper(ko),[P.source,Ju(Ts(P),hr(x,S),!0)]))):t.onError(me(32,C.loc))}else{if(w){if(p.has(w)){t.onError(me(38,I));continue}p.add(w),w==="default"&&(a=!0)}i.push(xe(x,S))}}if(!c){const v=(A,E)=>{const m=u(A,void 0,E,r);return t.compatConfig&&(m.isNonScopedSlot=!0),xe("default",m)};l?f.length&&f.some(A=>Y0(A))&&(a?t.onError(me(39,f[0].loc)):i.push(v(void 0,f))):i.push(v(void 0,n))}const h=o?2:Ar(e.children)?3:1;let _=ot(i.concat(xe("_",X(h+"",!1))),r);return s.length&&(_=Ae(t.helper(F0),[_,Eu(s)])),{slots:_,hasDynamicSlots:o}}function hr(e,t,u){const n=[xe("name",e),xe("fn",t)];return u!=null&&n.push(xe("key",X(String(u),!0))),ot(n)}function Ar(e){for(let t=0;t<e.length;t++){const u=e[t];switch(u.type){case 1:if(u.tagType===2||Ar(u.children))return!0;break;case 9:if(Ar(u.branches))return!0;break;case 10:case 11:if(Ar(u.children))return!0;break}}return!1}function Y0(e){return e.type!==2&&e.type!==12?!0:e.type===2?!!e.content.trim():Y0(e.content)}const X0=new WeakMap,_m=(e,t)=>function(){if(e=t.currentNode,!(e.type===1&&(e.tagType===0||e.tagType===1)))return;const{tag:n,props:r}=e,i=e.tagType===1;let s=i?ym(e,t):`"${n}"`;const o=oe(s)&&s.callee===yo;let c,l,a=0,f,p,d,h=o||s===Sn||s===po||!i&&(n==="svg"||n==="foreignObject"||n==="math");if(r.length>0){const _=Q0(e,t,void 0,i,o);c=_.props,a=_.patchFlag,p=_.dynamicPropNames;const 
v=_.directives;d=v&&v.length?Eu(v.map(A=>Em(A,t))):void 0,_.shouldUseBlock&&(h=!0)}if(e.children.length>0)if(s===qr&&(h=!0,a|=1024),i&&s!==Sn&&s!==qr){const{slots:v,hasDynamicSlots:A}=gm(e,t);l=v,A&&(a|=1024)}else if(e.children.length===1&&s!==Sn){const v=e.children[0],A=v.type,E=A===5||A===8;E&&tt(v,t)===0&&(a|=1),E||A===2?l=v:l=e.children}else l=e.children;p&&p.length&&(f=Cm(p)),e.codegenNode=Bn(t,s,c,l,a===0?void 0:a,f,d,!!h,!1,i,e.loc)};function ym(e,t,u=!1){let{tag:n}=e;const r=ws(n),i=ki(e,"is",!1,!0);if(i)if(r||Cu("COMPILER_IS_ON_ELEMENT",t)){let o;if(i.type===6?o=i.value&&X(i.value.content,!0):(o=i.exp,o||(o=X("is",!1,i.arg.loc))),o)return Ae(t.helper(yo),[o])}else i.type===6&&i.value.content.startsWith("vue:")&&(n=i.value.content.slice(4));const s=N0(n)||t.isBuiltInComponent(n);return s?(u||t.helper(s),s):(t.helper(_o),t.components.add(n),$n(n,"component"))}function Q0(e,t,u=e.props,n,r,i=!1){const{tag:s,loc:o,children:c}=e;let l=[];const a=[],f=[],p=c.length>0;let d=!1,h=0,_=!1,v=!1,A=!1,E=!1,m=!1,g=!1;const x=[],k=S=>{l.length&&(a.push(ot(Gc(l),o)),l=[]),S&&a.push(S)},I=()=>{t.scopes.vFor>0&&l.push(xe(X("ref_for",!0),X("true")))},w=({key:S,value:R})=>{if(Ge(S)){const T=S.content,P=vu(T);if(P&&(!n||r)&&T.toLowerCase()!=="onclick"&&T!=="onUpdate:modelValue"&&!tu(T)&&(E=!0),P&&tu(T)&&(g=!0),P&&R.type===14&&(R=R.arguments[0]),R.type===20||(R.type===4||R.type===8)&&tt(R,t)>0)return;T==="ref"?_=!0:T==="class"?v=!0:T==="style"?A=!0:T!=="key"&&!x.includes(T)&&x.push(T),n&&(T==="class"||T==="style")&&!x.includes(T)&&x.push(T)}else m=!0};for(let S=0;S<u.length;S++){const R=u[S];if(R.type===6){const{loc:T,name:P,nameLoc:q,value:z}=R;let 
H=!0;if(P==="ref"&&(_=!0,I()),P==="is"&&(ws(s)||z&&z.content.startsWith("vue:")||Cu("COMPILER_IS_ON_ELEMENT",t)))continue;l.push(xe(X(P,!0,q),X(z?z.content:"",H,z?z.loc:T)))}else{const{name:T,arg:P,exp:q,loc:z,modifiers:H}=R,J=T==="bind",j=T==="on";if(T==="slot"){n||t.onError(me(40,z));continue}if(T==="once"||T==="memo"||T==="is"||J&&mu(P,"is")&&(ws(s)||Cu("COMPILER_IS_ON_ELEMENT",t))||j&&i)continue;if((J&&mu(P,"key")||j&&p&&mu(P,"vue:before-update"))&&(d=!0),J&&mu(P,"ref")&&I(),!P&&(J||j)){if(m=!0,q)if(J){if(I(),k(),Cu("COMPILER_V_BIND_OBJECT_ORDER",t)){a.unshift(q);continue}a.push(q)}else k({type:14,loc:z,callee:t.helper(vo),arguments:n?[q]:[q,"true"]});else t.onError(me(J?34:35,z));continue}J&&H.some(je=>je.content==="prop")&&(h|=32);const be=t.directiveTransforms[T];if(be){const{props:je,needRuntime:dt}=be(R,e,t);!i&&je.forEach(w),j&&P&&!Ge(P)?k(ot(je,o)):l.push(...je),dt&&(f.push(R),Ze(dt)&&X0.set(R,dt))}else zf(T)||(f.push(R),p&&(d=!0))}}let C;if(a.length?(k(),a.length>1?C=Ae(t.helper(jr),a,o):C=a[0]):l.length&&(C=ot(Gc(l),o)),m?h|=16:(v&&!n&&(h|=2),A&&!n&&(h|=4),x.length&&(h|=8),E&&(h|=32)),!d&&(h===0||h===32)&&(_||g||f.length>0)&&(h|=512),!t.inSSR&&C)switch(C.type){case 15:let S=-1,R=-1,T=!1;for(let z=0;z<C.properties.length;z++){const H=C.properties[z].key;Ge(H)?H.content==="class"?S=z:H.content==="style"&&(R=z):H.isHandlerKey||(T=!0)}const P=C.properties[S],q=C.properties[R];T?C=Ae(t.helper(Ln),[C]):(P&&!Ge(P.value)&&(P.value=Ae(t.helper(Ao),[P.value])),q&&(A||q.value.type===4&&q.value.content.trim()[0]==="["||q.value.type===17)&&(q.value=Ae(t.helper(So),[q.value])));break;case 14:break;default:C=Ae(t.helper(Ln),[Ae(t.helper(Xn),[C])]);break}return{props:C,directives:f,patchFlag:h,dynamicPropNames:x,shouldUseBlock:d}}function Gc(e){const t=new Map,u=[];for(let n=0;n<e.length;n++){const r=e[n];if(r.key.type===8||!r.key.isStatic){u.push(r);continue}const i=r.key.content,s=t.get(i);s?(i==="style"||i==="class"||vu(i))&&xm(s,r):(t.set(i,r),u.push(r))}return 
u}function xm(e,t){e.value.type===17?e.value.elements.push(t.value):e.value=Eu([e.value,t.value],e.loc)}function Em(e,t){const u=[],n=X0.get(e);n?u.push(t.helperString(n)):(t.helper(xo),t.directives.add(e.name),u.push($n(e.name,"directive")));const{loc:r}=e;if(e.exp&&u.push(e.exp),e.arg&&(e.exp||u.push("void 0"),u.push(e.arg)),Object.keys(e.modifiers).length){e.arg||(e.exp||u.push("void 0"),u.push("void 0"));const i=X("true",!1,r);u.push(ot(e.modifiers.map(s=>xe(s,i)),r))}return Eu(u,e.loc)}function Cm(e){let t="[";for(let u=0,n=e.length;u<n;u++)t+=JSON.stringify(e[u]),u<n-1&&(t+=", ");return t+"]"}function ws(e){return e==="component"||e==="Component"}const km=(e,t)=>{if(Gr(e)){const{children:u,loc:n}=e,{slotName:r,slotProps:i}=Am(e,t),s=[t.prefixIdentifiers?"_ctx.$slots":"$slots",r,"{}","undefined","true"];let o=2;i&&(s[2]=i,o=3),u.length&&(s[3]=Ju([],u,!1,!1,n),o=4),t.scopeId&&!t.slotted&&(o=5),s.splice(o),e.codegenNode=Ae(t.helper(w0),s,n)}};function Am(e,t){let u='"default"',n;const r=[];for(let i=0;i<e.props.length;i++){const s=e.props[i];if(s.type===6)s.value&&(s.name==="name"?u=JSON.stringify(s.value.content):(s.name=de(s.name),r.push(s)));else if(s.name==="bind"&&mu(s.arg,"name")){if(s.exp)u=s.exp;else if(s.arg&&s.arg.type===4){const o=de(s.arg.content);u=s.exp=X(o,!1,s.arg.loc)}}else s.name==="bind"&&s.arg&&Ge(s.arg)&&(s.arg.content=de(s.arg.content)),r.push(s)}if(r.length>0){const{props:i,directives:s}=Q0(e,t,r,!1,!1);n=i,s.length&&t.onError(me(36,s[0].loc))}return{slotName:u,slotProps:n}}const ef=(e,t,u,n)=>{const{loc:r,modifiers:i,arg:s}=e;!e.exp&&!i.length&&u.onError(me(35,r));let o;if(s.type===4)if(s.isStatic){let f=s.content;f.startsWith("vue:")&&(f=`vnode-${f.slice(4)}`);const p=t.tagType!==0||f.startsWith("vnode")||!/[A-Z]/.test(f)?Bu(de(f)):`on:${f}`;o=X(p,!0,s.loc)}else o=gt([`${u.helperString(As)}(`,s,")"]);else o=s,o.children.unshift(`${u.helperString(As)}(`),o.children.push(")");let c=e.exp;c&&!c.content.trim()&&(c=void 0);let 
l=u.cacheHandlers&&!c&&!u.inVOnce;if(c){const f=M0(c),p=!(f||E2(c)),d=c.content.includes(";");(p||l&&f)&&(c=gt([`${p?"$event":"(...args)"} => ${d?"{":"("}`,c,d?"}":")"]))}let a={props:[xe(o,c||X("() => {}",!1,r))]};return n&&(a=n(a)),l&&(a.props[0].value=u.cache(a.props[0].value)),a.props.forEach(f=>f.key.isHandlerKey=!0),a},Sm=(e,t)=>{if(e.type===0||e.type===1||e.type===11||e.type===10)return()=>{const u=e.children;let n,r=!1;for(let i=0;i<u.length;i++){const s=u[i];if(Ki(s)){r=!0;for(let o=i+1;o<u.length;o++){const c=u[o];if(Ki(c))n||(n=u[i]=gt([s],s.loc)),n.children.push(" + ",c),u.splice(o,1),o--;else{n=void 0;break}}}}if(!(!r||u.length===1&&(e.type===0||e.type===1&&e.tagType===0&&!e.props.find(i=>i.type===7&&!t.directiveTransforms[i.name])&&e.tag!=="template")))for(let i=0;i<u.length;i++){const s=u[i];if(Ki(s)||s.type===8){const o=[];(s.type!==2||s.content!==" ")&&o.push(s),!t.ssr&&tt(s,t)===0&&o.push("1"),u[i]={type:12,content:s,loc:s.loc,codegenNode:Ae(t.helper(go),o)}}}}},Zc=new WeakSet,vm=(e,t)=>{if(e.type===1&&st(e,"once",!0))return Zc.has(e)||t.inVOnce||t.inSSR?void 0:(Zc.add(e),t.inVOnce=!0,t.helper(zr),()=>{t.inVOnce=!1;const u=t.currentNode;u.codegenNode&&(u.codegenNode=t.cache(u.codegenNode,!0,!0))})},tf=(e,t,u)=>{const{exp:n,arg:r}=e;if(!n)return u.onError(me(41,e.loc)),pr();const i=n.loc.source.trim(),s=n.type===4?n.content:i,o=u.bindingMetadata[i];if(o==="props"||o==="props-aliased")return u.onError(me(44,n.loc)),pr();if(!s.trim()||!M0(n)&&!!1)return u.onError(me(42,n.loc)),pr();const l=r||X("modelValue",!0),a=r?Ge(r)?`onUpdate:${de(r.content)}`:gt(['"onUpdate:" + ',r]):"onUpdate:modelValue";let f;const p=u.isTS?"($event: any)":"$event";f=gt([`${p} => ((`,n,") = $event)"]);const d=[xe(l,e.exp),xe(a,f)];if(e.modifiers.length&&t.tagType===1){const h=e.modifiers.map(v=>v.content).map(v=>(Io(v)?v:JSON.stringify(v))+": true").join(", "),_=r?Ge(r)?`${r.content}Modifiers`:gt([r,' + "Modifiers"']):"modelModifiers";d.push(xe(_,X(`{ ${h} 
}`,!1,e.loc,2)))}return pr(d)};function pr(e=[]){return{props:e}}const Dm=/[\w).+\-_$\]]/,Tm=(e,t)=>{Cu("COMPILER_FILTERS",t)&&(e.type===5?Jr(e.content,t):e.type===1&&e.props.forEach(u=>{u.type===7&&u.name!=="for"&&u.exp&&Jr(u.exp,t)}))};function Jr(e,t){if(e.type===4)Jc(e,t);else for(let u=0;u<e.children.length;u++){const n=e.children[u];typeof n=="object"&&(n.type===4?Jc(n,t):n.type===8?Jr(e,t):n.type===5&&Jr(n.content,t))}}function Jc(e,t){const u=e.content;let n=!1,r=!1,i=!1,s=!1,o=0,c=0,l=0,a=0,f,p,d,h,_=[];for(d=0;d<u.length;d++)if(p=f,f=u.charCodeAt(d),n)f===39&&p!==92&&(n=!1);else if(r)f===34&&p!==92&&(r=!1);else if(i)f===96&&p!==92&&(i=!1);else if(s)f===47&&p!==92&&(s=!1);else if(f===124&&u.charCodeAt(d+1)!==124&&u.charCodeAt(d-1)!==124&&!o&&!c&&!l)h===void 0?(a=d+1,h=u.slice(0,d).trim()):v();else{switch(f){case 34:r=!0;break;case 39:n=!0;break;case 96:i=!0;break;case 40:l++;break;case 41:l--;break;case 91:c++;break;case 93:c--;break;case 123:o++;break;case 125:o--;break}if(f===47){let A=d-1,E;for(;A>=0&&(E=u.charAt(A),E===" ");A--);(!E||!Dm.test(E))&&(s=!0)}}h===void 0?h=u.slice(0,d).trim():a!==0&&v();function v(){_.push(u.slice(a,d).trim()),a=d+1}if(_.length){for(d=0;d<_.length;d++)h=wm(h,_[d],t);e.content=h,e.ast=void 0}}function wm(e,t,u){u.helper(Eo);const n=t.indexOf("(");if(n<0)return u.filters.add(t),`${$n(t,"filter")}(${e})`;{const r=t.slice(0,n),i=t.slice(n+1);return u.filters.add(r),`${$n(r,"filter")}(${e}${i!==")"?","+i:i}`}}const Yc=new WeakSet,Fm=(e,t)=>{if(e.type===1){const u=st(e,"memo");return!u||Yc.has(e)?void 0:(Yc.add(e),()=>{const n=e.codegenNode||t.currentNode.codegenNode;n&&n.type===13&&(e.tagType!==1&&wo(n,t),e.codegenNode=Ae(t.helper(To),[u.exp,Ju(void 0,n),"_cache",String(t.cached.length)]),t.cached.push(null))})}};function Im(e){return[[vm,cm,Fm,dm,Tm,km,_m,mm,Sm],{on:ef,bind:fm,model:tf}]}function Om(e,t={}){const u=t.onError||Fo,n=t.mode==="module";t.prefixIdentifiers===!0?u(me(47)):n&&u(me(48));const 
r=!1;t.cacheHandlers&&u(me(49)),t.scopeId&&!n&&u(me(50));const i=te({},t,{prefixIdentifiers:r}),s=Z(e)?V2(e,i):e,[o,c]=Im();return q2(s,te({},i,{nodeTransforms:[...o,...t.nodeTransforms||[]],directiveTransforms:te({},c,t.directiveTransforms||{})})),W2(s,i)}const Nm=()=>({props:[]});/**
-* @vue/compiler-dom v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/const uf=Symbol(""),nf=Symbol(""),rf=Symbol(""),sf=Symbol(""),Fs=Symbol(""),of=Symbol(""),cf=Symbol(""),lf=Symbol(""),af=Symbol(""),ff=Symbol("");l2({[uf]:"vModelRadio",[nf]:"vModelCheckbox",[rf]:"vModelText",[sf]:"vModelSelect",[Fs]:"vModelDynamic",[of]:"withModifiers",[cf]:"withKeys",[lf]:"vShow",[af]:"Transition",[ff]:"TransitionGroup"});let Nu;function Rm(e,t=!1){return Nu||(Nu=document.createElement("div")),t?(Nu.innerHTML=`<div foo="${e.replace(/"/g,"&quot;")}">`,Nu.children[0].getAttribute("foo")):(Nu.innerHTML=e,Nu.textContent)}const Mm={parseMode:"html",isVoidTag:ld,isNativeTag:e=>sd(e)||od(e)||cd(e),isPreTag:e=>e==="pre",isIgnoreNewlineTag:e=>e==="pre"||e==="textarea",decodeEntities:Rm,isBuiltInComponent:e=>{if(e==="Transition"||e==="transition")return af;if(e==="TransitionGroup"||e==="transition-group")return ff},getNamespace(e,t,u){let n=t?t.ns:u;if(t&&n===2)if(t.tag==="annotation-xml"){if(e==="svg")return 1;t.props.some(r=>r.type===6&&r.name==="encoding"&&r.value!=null&&(r.value.content==="text/html"||r.value.content==="application/xhtml+xml"))&&(n=0)}else/^m(?:[ions]|text)$/.test(t.tag)&&e!=="mglyph"&&e!=="malignmark"&&(n=0);else t&&n===1&&(t.tag==="foreignObject"||t.tag==="desc"||t.tag==="title")&&(n=0);if(n===0){if(e==="svg")return 1;if(e==="math")return 2}return n}},Pm=e=>{e.type===1&&e.props.forEach((t,u)=>{t.type===6&&t.name==="style"&&t.value&&(e.props[u]={type:7,name:"bind",arg:X("style",!0,t.loc),exp:Lm(t.value.content,t.loc),modifiers:[],loc:t.loc})})},Lm=(e,t)=>{const u=xl(e);return X(JSON.stringify(u),!1,t,3)};function iu(e,t){return me(e,t)}const Bm=(e,t,u)=>{const{exp:n,loc:r}=e;return n||u.onError(iu(53,r)),t.children.length&&(u.onError(iu(54,r)),t.children.length=0),{props:[xe(X("innerHTML",!0,r),n||X("",!0))]}},Vm=(e,t,u)=>{const{exp:n,loc:r}=e;return 
n||u.onError(iu(55,r)),t.children.length&&(u.onError(iu(56,r)),t.children.length=0),{props:[xe(X("textContent",!0),n?tt(n,u)>0?n:Ae(u.helperString(Ci),[n],r):X("",!0))]}},$m=(e,t,u)=>{const n=tf(e,t,u);if(!n.props.length||t.tagType===1)return n;e.arg&&u.onError(iu(58,e.arg.loc));const{tag:r}=t,i=u.isCustomElement(r);if(r==="input"||r==="textarea"||r==="select"||i){let s=rf,o=!1;if(r==="input"||i){const c=ki(t,"type");if(c){if(c.type===7)s=Fs;else if(c.value)switch(c.value.content){case"radio":s=uf;break;case"checkbox":s=nf;break;case"file":o=!0,u.onError(iu(59,e.loc));break}}else C2(t)&&(s=Fs)}else r==="select"&&(s=sf);o||(n.needRuntime=u.helper(s))}else u.onError(iu(57,e.loc));return n.props=n.props.filter(s=>!(s.key.type===4&&s.key.content==="modelValue")),n},Hm=Be("passive,once,capture"),Um=Be("stop,prevent,self,ctrl,shift,alt,meta,exact,middle"),qm=Be("left,right"),df=Be("onkeyup,onkeydown,onkeypress"),jm=(e,t,u,n)=>{const r=[],i=[],s=[];for(let o=0;o<t.length;o++){const c=t[o].content;c==="native"&&Vn("COMPILER_V_ON_NATIVE",u)||Hm(c)?s.push(c):qm(c)?Ge(e)?df(e.content.toLowerCase())?r.push(c):i.push(c):(r.push(c),i.push(c)):Um(c)?i.push(c):r.push(c)}return{keyModifiers:r,nonKeyModifiers:i,eventOptionModifiers:s}},Xc=(e,t)=>Ge(e)&&e.content.toLowerCase()==="onclick"?X(t,!0):e.type!==4?gt(["(",e,`) === "onClick" ? 
"${t}" : (`,e,")"]):e,zm=(e,t,u)=>ef(e,t,u,n=>{const{modifiers:r}=e;if(!r.length)return n;let{key:i,value:s}=n.props[0];const{keyModifiers:o,nonKeyModifiers:c,eventOptionModifiers:l}=jm(i,r,u,e.loc);if(c.includes("right")&&(i=Xc(i,"onContextmenu")),c.includes("middle")&&(i=Xc(i,"onMouseup")),c.length&&(s=Ae(u.helper(of),[s,JSON.stringify(c)])),o.length&&(!Ge(i)||df(i.content.toLowerCase()))&&(s=Ae(u.helper(cf),[s,JSON.stringify(o)])),l.length){const a=l.map(Tu).join("");i=Ge(i)?X(`${i.content}${a}`,!0):gt(["(",i,`) + "${a}"`])}return{props:[xe(i,s)]}}),Km=(e,t,u)=>{const{exp:n,loc:r}=e;return n||u.onError(iu(61,r)),{props:[],needRuntime:u.helper(lf)}},Wm=(e,t)=>{e.type===1&&e.tagType===0&&(e.tag==="script"||e.tag==="style")&&t.removeNode()},Gm=[Pm],Zm={cloak:Nm,html:Bm,text:Vm,model:$m,on:zm,show:Km};function Jm(e,t={}){return Om(e,te({},Mm,t,{nodeTransforms:[Wm,...Gm,...t.nodeTransforms||[]],directiveTransforms:te({},Zm,t.directiveTransforms||{}),transformHoist:null}))}/**
-* vue v3.5.13
-* (c) 2018-present Yuxi (Evan) You and Vue contributors
-* @license MIT
-**/const Qc=Object.create(null);function Ym(e,t){if(!Z(e))if(e.nodeType)e=e.innerHTML;else return we;const u=Gf(e,t),n=Qc[u];if(n)return n;if(e[0]==="#"){const o=document.querySelector(e);e=o?o.innerHTML:""}const r=te({hoistStatic:!0,onError:void 0,onWarn:we},t);!r.isCustomElement&&typeof customElements<"u"&&(r.isCustomElement=o=>!!customElements.get(o));const{code:i}=Jm(e,r),s=new Function("Vue",i)(n2);return s._rc=!0,Qc[u]=s}Ya(Ym);const Xm={stream:!0,temperature:.2};let Qm=null;class el extends Error{constructor(t,u,n){super(t),this.name=u}}async function*eb(e,t={},u={}){var p;let n=u.controller;const r=((p=u.api_url)==null?void 0:p.replace(/\/+$/,""))||"";n||(n=new AbortController);const i={...Xm,...t,prompt:e},s=await fetch(`${r}${u.endpoint||"/completion"}`,{method:"POST",body:JSON.stringify(i),headers:{Connection:"keep-alive","Content-Type":"application/json",Accept:"text/event-stream",...t.api_key?{Authorization:`Bearer ${t.api_key}`}:{}},signal:n.signal});if(s.status!==200)try{const d=await s.json();if(d&&d.error&&d.error.message)throw new el(d.error.message,"ServerError")}catch(d){throw new el(d.message,"ServerError")}const c=s.body.getReader(),l=new TextDecoder;let a="",f="";try{let d=!0;for(;d;){const h=await c.read();if(h.done)break;const _=f+l.decode(h.value),v=_.endsWith(`
-`);let A=_.split(`
-`);v?f="":f=A.pop();const E=/^(\S+):\s(.*)$/gm;for(const m of A){const g=E.exec(m);if(g){if(h[g[1]]=g[2],h.data==="[DONE]"){d=!1;break}if(h.data&&(h.data=JSON.parse(h.data),a+=h.data.content,yield h,h.data.stop)){h.data.generation_settings&&(Qm=h.data.generation_settings),d=!1;break}if(h.error)try{if(h.error=JSON.parse(h.error),h.error.message.includes("slot unavailable"))throw new Error("slot unavailable");console.error(`llama.cpp error [${h.error.code} - ${h.error.type}]: ${h.error.message}`)}catch{console.error(`llama.cpp error ${h.error}`)}}}}}catch(d){throw d.name!=="AbortError"&&console.error("llama error: ",d),d}finally{n.abort()}return a}const tl={};function tb(e){let t=tl[e];if(t)return t;t=tl[e]=[];for(let u=0;u<128;u++){const n=String.fromCharCode(u);t.push(n)}for(let u=0;u<e.length;u++){const n=e.charCodeAt(u);t[n]="%"+("0"+n.toString(16).toUpperCase()).slice(-2)}return t}function Qu(e,t){typeof t!="string"&&(t=Qu.defaultChars);const u=tb(t);return e.replace(/(%[a-f0-9]{2})+/gi,function(n){let r="";for(let i=0,s=n.length;i<s;i+=3){const o=parseInt(n.slice(i+1,i+3),16);if(o<128){r+=u[o];continue}if((o&224)===192&&i+3<s){const c=parseInt(n.slice(i+4,i+6),16);if((c&192)===128){const l=o<<6&1984|c&63;l<128?r+="��":r+=String.fromCharCode(l),i+=3;continue}}if((o&240)===224&&i+6<s){const c=parseInt(n.slice(i+4,i+6),16),l=parseInt(n.slice(i+7,i+9),16);if((c&192)===128&&(l&192)===128){const a=o<<12&61440|c<<6&4032|l&63;a<2048||a>=55296&&a<=57343?r+="���":r+=String.fromCharCode(a),i+=6;continue}}if((o&248)===240&&i+9<s){const c=parseInt(n.slice(i+4,i+6),16),l=parseInt(n.slice(i+7,i+9),16),a=parseInt(n.slice(i+10,i+12),16);if((c&192)===128&&(l&192)===128&&(a&192)===128){let f=o<<18&1835008|c<<12&258048|l<<6&4032|a&63;f<65536||f>1114111?r+="����":(f-=65536,r+=String.fromCharCode(55296+(f>>10),56320+(f&1023))),i+=9;continue}}r+="�"}return r})}Qu.defaultChars=";/?:@&=+$,#";Qu.componentChars="";const ul={};function ub(e){let t=ul[e];if(t)return t;t=ul[e]=[];for(let 
u=0;u<128;u++){const n=String.fromCharCode(u);/^[0-9a-z]$/i.test(n)?t.push(n):t.push("%"+("0"+u.toString(16).toUpperCase()).slice(-2))}for(let u=0;u<e.length;u++)t[e.charCodeAt(u)]=e[u];return t}function er(e,t,u){typeof t!="string"&&(u=t,t=er.defaultChars),typeof u>"u"&&(u=!0);const n=ub(t);let r="";for(let i=0,s=e.length;i<s;i++){const o=e.charCodeAt(i);if(u&&o===37&&i+2<s&&/^[0-9a-f]{2}$/i.test(e.slice(i+1,i+3))){r+=e.slice(i,i+3),i+=2;continue}if(o<128){r+=n[o];continue}if(o>=55296&&o<=57343){if(o>=55296&&o<=56319&&i+1<s){const c=e.charCodeAt(i+1);if(c>=56320&&c<=57343){r+=encodeURIComponent(e[i]+e[i+1]),i++;continue}}r+="%EF%BF%BD";continue}r+=encodeURIComponent(e[i])}return r}er.defaultChars=";/?:@&=+$,-_.!~*'()#";er.componentChars="-_.!~*'()";function Ro(e){let t="";return t+=e.protocol||"",t+=e.slashes?"//":"",t+=e.auth?e.auth+"@":"",e.hostname&&e.hostname.indexOf(":")!==-1?t+="["+e.hostname+"]":t+=e.hostname||"",t+=e.port?":"+e.port:"",t+=e.pathname||"",t+=e.search||"",t+=e.hash||"",t}function Yr(){this.protocol=null,this.slashes=null,this.auth=null,this.port=null,this.hostname=null,this.hash=null,this.search=null,this.pathname=null}const nb=/^([a-z0-9.+-]+:)/i,rb=/:[0-9]*$/,ib=/^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/,sb=["<",">",'"',"`"," ","\r",`
-`,"	"],ob=["{","}","|","\\","^","`"].concat(sb),cb=["'"].concat(ob),nl=["%","/","?",";","#"].concat(cb),rl=["/","?","#"],lb=255,il=/^[+a-z0-9A-Z_-]{0,63}$/,ab=/^([+a-z0-9A-Z_-]{0,63})(.*)$/,sl={javascript:!0,"javascript:":!0},ol={http:!0,https:!0,ftp:!0,gopher:!0,file:!0,"http:":!0,"https:":!0,"ftp:":!0,"gopher:":!0,"file:":!0};function Mo(e,t){if(e&&e instanceof Yr)return e;const u=new Yr;return u.parse(e,t),u}Yr.prototype.parse=function(e,t){let u,n,r,i=e;if(i=i.trim(),!t&&e.split("#").length===1){const l=ib.exec(i);if(l)return this.pathname=l[1],l[2]&&(this.search=l[2]),this}let s=nb.exec(i);if(s&&(s=s[0],u=s.toLowerCase(),this.protocol=s,i=i.substr(s.length)),(t||s||i.match(/^\/\/[^@\/]+@[^@\/]+/))&&(r=i.substr(0,2)==="//",r&&!(s&&sl[s])&&(i=i.substr(2),this.slashes=!0)),!sl[s]&&(r||s&&!ol[s])){let l=-1;for(let h=0;h<rl.length;h++)n=i.indexOf(rl[h]),n!==-1&&(l===-1||n<l)&&(l=n);let a,f;l===-1?f=i.lastIndexOf("@"):f=i.lastIndexOf("@",l),f!==-1&&(a=i.slice(0,f),i=i.slice(f+1),this.auth=a),l=-1;for(let h=0;h<nl.length;h++)n=i.indexOf(nl[h]),n!==-1&&(l===-1||n<l)&&(l=n);l===-1&&(l=i.length),i[l-1]===":"&&l--;const p=i.slice(0,l);i=i.slice(l),this.parseHost(p),this.hostname=this.hostname||"";const d=this.hostname[0]==="["&&this.hostname[this.hostname.length-1]==="]";if(!d){const h=this.hostname.split(/\./);for(let _=0,v=h.length;_<v;_++){const A=h[_];if(A&&!A.match(il)){let E="";for(let m=0,g=A.length;m<g;m++)A.charCodeAt(m)>127?E+="x":E+=A[m];if(!E.match(il)){const m=h.slice(0,_),g=h.slice(_+1),x=A.match(ab);x&&(m.push(x[1]),g.unshift(x[2])),g.length&&(i=g.join(".")+i),this.hostname=m.join(".");break}}}}this.hostname.length>lb&&(this.hostname=""),d&&(this.hostname=this.hostname.substr(1,this.hostname.length-2))}const o=i.indexOf("#");o!==-1&&(this.hash=i.substr(o),i=i.slice(0,o));const c=i.indexOf("?");return 
c!==-1&&(this.search=i.substr(c),i=i.slice(0,c)),i&&(this.pathname=i),ol[u]&&this.hostname&&!this.pathname&&(this.pathname=""),this};Yr.prototype.parseHost=function(e){let t=rb.exec(e);t&&(t=t[0],t!==":"&&(this.port=t.substr(1)),e=e.substr(0,e.length-t.length)),e&&(this.hostname=e)};const fb=Object.freeze(Object.defineProperty({__proto__:null,decode:Qu,encode:er,format:Ro,parse:Mo},Symbol.toStringTag,{value:"Module"})),hf=/[\0-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/,pf=/[\0-\x1F\x7F-\x9F]/,db=/[\xAD\u0600-\u0605\u061C\u06DD\u070F\u0890\u0891\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD80D[\uDC30-\uDC3F]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F]/,Po=/[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061D-\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1B7D\u1B7E\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u2E52-\u2E5D\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE6
1\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDEAD\uDF55-\uDF59\uDF86-\uDF89]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5A\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDEB9\uDF3C-\uDF3E]|\uD806[\uDC3B\uDD44-\uDD46\uDDE2\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2\uDF00-\uDF09]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8\uDF43-\uDF4F\uDFFF]|\uD809[\uDC70-\uDC74]|\uD80B[\uDFF1\uDFF2]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A\uDFE2]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]/,mf=/[\$\+<->\^`\|~\xA2-\xA6\xA8\xA9\xAC\xAE-\xB1\xB4\xB8\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0384\u0385\u03F6\u0482\u058D-\u058F\u0606-\u0608\u060B\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u07FE\u07FF\u0888\u09F2\u09F3\u09FA\u09FB\u0AF1\u0B70\u0BF3-\u0BFA\u0C7F\u0D4F\u0D79\u0E3F\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u166D\u17DB\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u1FBD\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF\u1FFD\u1FFE\u2044\u2052\u207A-\u207C\u208A-\u208C\u20A0-\u20C0\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u2140-\u2144\u214A-\u214D\u214F\u218A\u218B\u2190-\u2307\u230C-\u2328\u232B-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u2767\u2794-\u27C4\u27C7-\u27E5\u27F0-\u2982\u2999-\u29D7\u29DC-\u29FB\u29FE-\u2B73\u2B76-\u2B95\u2B97-\u2BFF\u2CE5-\u2CEA\u2E50\u2E51\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFF\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u309B\u309C\u3190\u3191\u3196-\u319F\u31C0-\
u31E3\u31EF\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA700-\uA716\uA720\uA721\uA789\uA78A\uA828-\uA82B\uA836-\uA839\uAA77-\uAA79\uAB5B\uAB6A\uAB6B\uFB29\uFBB2-\uFBC2\uFD40-\uFD4F\uFDCF\uFDFC-\uFDFF\uFE62\uFE64-\uFE66\uFE69\uFF04\uFF0B\uFF1C-\uFF1E\uFF3E\uFF40\uFF5C\uFF5E\uFFE0-\uFFE6\uFFE8-\uFFEE\uFFFC\uFFFD]|\uD800[\uDD37-\uDD3F\uDD79-\uDD89\uDD8C-\uDD8E\uDD90-\uDD9C\uDDA0\uDDD0-\uDDFC]|\uD802[\uDC77\uDC78\uDEC8]|\uD805\uDF3F|\uD807[\uDFD5-\uDFF1]|\uD81A[\uDF3C-\uDF3F\uDF45]|\uD82F\uDC9C|\uD833[\uDF50-\uDFC3]|\uD834[\uDC00-\uDCF5\uDD00-\uDD26\uDD29-\uDD64\uDD6A-\uDD6C\uDD83\uDD84\uDD8C-\uDDA9\uDDAE-\uDDEA\uDE00-\uDE41\uDE45\uDF00-\uDF56]|\uD835[\uDEC1\uDEDB\uDEFB\uDF15\uDF35\uDF4F\uDF6F\uDF89\uDFA9\uDFC3]|\uD836[\uDC00-\uDDFF\uDE37-\uDE3A\uDE6D-\uDE74\uDE76-\uDE83\uDE85\uDE86]|\uD838[\uDD4F\uDEFF]|\uD83B[\uDCAC\uDCB0\uDD2E\uDEF0\uDEF1]|\uD83C[\uDC00-\uDC2B\uDC30-\uDC93\uDCA0-\uDCAE\uDCB1-\uDCBF\uDCC1-\uDCCF\uDCD1-\uDCF5\uDD0D-\uDDAD\uDDE6-\uDE02\uDE10-\uDE3B\uDE40-\uDE48\uDE50\uDE51\uDE60-\uDE65\uDF00-\uDFFF]|\uD83D[\uDC00-\uDED7\uDEDC-\uDEEC\uDEF0-\uDEFC\uDF00-\uDF76\uDF7B-\uDFD9\uDFE0-\uDFEB\uDFF0]|\uD83E[\uDC00-\uDC0B\uDC10-\uDC47\uDC50-\uDC59\uDC60-\uDC87\uDC90-\uDCAD\uDCB0\uDCB1\uDD00-\uDE53\uDE60-\uDE6D\uDE70-\uDE7C\uDE80-\uDE88\uDE90-\uDEBD\uDEBF-\uDEC5\uDECE-\uDEDB\uDEE0-\uDEE8\uDEF0-\uDEF8\uDF00-\uDF92\uDF94-\uDFCA]/,bf=/[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/,hb=Object.freeze(Object.defineProperty({__proto__:null,Any:hf,Cc:pf,Cf:db,P:Po,S:mf,Z:bf},Symbol.toStringTag,{value:"Module"})),pb=new 
Uint16Array('ᵁ<Õıʊҝջאٵ۞ޢߖࠏ੊ઑඡ๭༉༦჊ረዡᐕᒝᓃᓟᔥ\0\0\0\0\0\0ᕫᛍᦍᰒᷝ὾⁠↰⊍⏀⏻⑂⠤⤒ⴈ⹈⿎〖㊺㘹㞬㣾㨨㩱㫠㬮ࠀEMabcfglmnoprstu\\bfms¦³¹ÈÏlig耻Æ䃆P耻&䀦cute耻Á䃁reve;䄂Āiyx}rc耻Â䃂;䐐r;쀀𝔄rave耻À䃀pha;䎑acr;䄀d;橓Āgp¡on;䄄f;쀀𝔸plyFunction;恡ing耻Å䃅Ācs¾Ãr;쀀𝒜ign;扔ilde耻Ã䃃ml耻Ä䃄ЀaceforsuåûþėĜĢħĪĀcrêòkslash;或Ŷöø;櫧ed;挆y;䐑ƀcrtąċĔause;戵noullis;愬a;䎒r;쀀𝔅pf;쀀𝔹eve;䋘còēmpeq;扎܀HOacdefhilorsuōőŖƀƞƢƵƷƺǜȕɳɸɾcy;䐧PY耻©䂩ƀcpyŝŢźute;䄆Ā;iŧŨ拒talDifferentialD;慅leys;愭ȀaeioƉƎƔƘron;䄌dil耻Ç䃇rc;䄈nint;戰ot;䄊ĀdnƧƭilla;䂸terDot;䂷òſi;䎧rcleȀDMPTǇǋǑǖot;抙inus;抖lus;投imes;抗oĀcsǢǸkwiseContourIntegral;戲eCurlyĀDQȃȏoubleQuote;思uote;怙ȀlnpuȞȨɇɕonĀ;eȥȦ户;橴ƀgitȯȶȺruent;扡nt;戯ourIntegral;戮ĀfrɌɎ;愂oduct;成nterClockwiseContourIntegral;戳oss;樯cr;쀀𝒞pĀ;Cʄʅ拓ap;才րDJSZacefiosʠʬʰʴʸˋ˗ˡ˦̳ҍĀ;oŹʥtrahd;椑cy;䐂cy;䐅cy;䐏ƀgrsʿ˄ˇger;怡r;憡hv;櫤Āayː˕ron;䄎;䐔lĀ;t˝˞戇a;䎔r;쀀𝔇Āaf˫̧Ācm˰̢riticalȀADGT̖̜̀̆cute;䂴oŴ̋̍;䋙bleAcute;䋝rave;䁠ilde;䋜ond;拄ferentialD;慆Ѱ̽\0\0\0͔͂\0Ѕf;쀀𝔻ƀ;DE͈͉͍䂨ot;惜qual;扐blèCDLRUVͣͲ΂ϏϢϸontourIntegraìȹoɴ͹\0\0ͻ»͉nArrow;懓Āeo·ΤftƀARTΐΖΡrrow;懐ightArrow;懔eåˊngĀLRΫτeftĀARγιrrow;柸ightArrow;柺ightArrow;柹ightĀATϘϞrrow;懒ee;抨pɁϩ\0\0ϯrrow;懑ownArrow;懕erticalBar;戥ǹABLRTaВЪаўѿͼrrowƀ;BUНОТ憓ar;椓pArrow;懵reve;䌑eft˒к\0ц\0ѐightVector;楐eeVector;楞ectorĀ;Bљњ憽ar;楖ightǔѧ\0ѱeeVector;楟ectorĀ;BѺѻ懁ar;楗eeĀ;A҆҇护rrow;憧ĀctҒҗr;쀀𝒟rok;䄐ࠀNTacdfglmopqstuxҽӀӄӋӞӢӧӮӵԡԯԶՒ՝ՠեG;䅊H耻Ð䃐cute耻É䃉ƀaiyӒӗӜron;䄚rc耻Ê䃊;䐭ot;䄖r;쀀𝔈rave耻È䃈ement;戈ĀapӺӾcr;䄒tyɓԆ\0\0ԒmallSquare;旻erySmallSquare;斫ĀgpԦԪon;䄘f;쀀𝔼silon;䎕uĀaiԼՉlĀ;TՂՃ橵ilde;扂librium;懌Āci՗՚r;愰m;橳a;䎗ml耻Ë䃋Āipժկsts;戃onentialE;慇ʀcfiosօֈ֍ֲ׌y;䐤r;쀀𝔉lledɓ֗\0\0֣mallSquare;旼erySmallSquare;斪Ͱֺ\0ֿ\0\0ׄf;쀀𝔽All;戀riertrf;愱cò׋؀JTabcdfgorstר׬ׯ׺؀ؒؖ؛؝أ٬ٲcy;䐃耻>䀾mmaĀ;d׷׸䎓;䏜reve;䄞ƀeiy؇،ؐdil;䄢rc;䄜;䐓ot;䄠r;쀀𝔊;拙pf;쀀𝔾eater̀EFGLSTصلَٖٛ٦qualĀ;Lؾؿ扥ess;招ullEqual;执reater;檢ess;扷lantEqual;橾ilde;扳cr;쀀𝒢;扫ЀAacfiosuڅڋږڛڞڪھۊRDcy;䐪Āctڐڔek;䋇;䁞irc;䄤r;愌lbertSpace;愋ǰگ\0ڲf;愍izontalLine;攀Āctۃۅòکrok;䄦mpńېۘownHumðįqual;扏܀EJOacdfgmnostuۺ۾܃܇܎ܚܞܡܨ݄ݸދޏޕcy;䐕lig;䄲cy;䐁cute耻Í䃍Āiyܓܘrc耻Î䃎;䐘ot;䄰r;愑rave耻Ì䃌ƀ;apܠܯܿĀcgܴܷr;䄪inaryI;慈lieóϝǴ݉\0ݢĀ;eݍݎ戬Āgrݓݘral;戫section;拂isibleĀCTݬݲomma;恣imes;恢ƀgptݿރވon;䄮f;쀀𝕀a;䎙cr;愐ilde;䄨ǫޚ\0ޞcy;䐆l耻Ï䃏ʀcfosuެ޷޼߂ߐĀiyޱ޵rc;䄴;䐙r
;쀀𝔍pf;쀀𝕁ǣ߇\0ߌr;쀀𝒥rcy;䐈kcy;䐄΀HJacfosߤߨ߽߬߱ࠂࠈcy;䐥cy;䐌ppa;䎚Āey߶߻dil;䄶;䐚r;쀀𝔎pf;쀀𝕂cr;쀀𝒦րJTaceflmostࠥࠩࠬࡐࡣ঳সে্਷ੇcy;䐉耻<䀼ʀcmnpr࠷࠼ࡁࡄࡍute;䄹bda;䎛g;柪lacetrf;愒r;憞ƀaeyࡗ࡜ࡡron;䄽dil;䄻;䐛Āfsࡨ॰tԀACDFRTUVarࡾࢩࢱࣦ࣠ࣼयज़ΐ४Ānrࢃ࢏gleBracket;柨rowƀ;BR࢙࢚࢞憐ar;懤ightArrow;懆eiling;挈oǵࢷ\0ࣃbleBracket;柦nǔࣈ\0࣒eeVector;楡ectorĀ;Bࣛࣜ懃ar;楙loor;挊ightĀAV࣯ࣵrrow;憔ector;楎Āerँगeƀ;AVउऊऐ抣rrow;憤ector;楚iangleƀ;BEतथऩ抲ar;槏qual;抴pƀDTVषूौownVector;楑eeVector;楠ectorĀ;Bॖॗ憿ar;楘ectorĀ;B॥०憼ar;楒ightáΜs̀EFGLSTॾঋকঝঢভqualGreater;拚ullEqual;扦reater;扶ess;檡lantEqual;橽ilde;扲r;쀀𝔏Ā;eঽা拘ftarrow;懚idot;䄿ƀnpw৔ਖਛgȀLRlr৞৷ਂਐeftĀAR০৬rrow;柵ightArrow;柷ightArrow;柶eftĀarγਊightáοightáϊf;쀀𝕃erĀLRਢਬeftArrow;憙ightArrow;憘ƀchtਾੀੂòࡌ;憰rok;䅁;扪Ѐacefiosuਗ਼੝੠੷੼અઋ઎p;椅y;䐜Ādl੥੯iumSpace;恟lintrf;愳r;쀀𝔐nusPlus;戓pf;쀀𝕄cò੶;䎜ҀJacefostuણધભીଔଙඑ඗ඞcy;䐊cute;䅃ƀaey઴હાron;䅇dil;䅅;䐝ƀgswે૰଎ativeƀMTV૓૟૨ediumSpace;怋hiĀcn૦૘ë૙eryThiî૙tedĀGL૸ଆreaterGreateòٳessLesóੈLine;䀊r;쀀𝔑ȀBnptଢନଷ଺reak;恠BreakingSpace;䂠f;愕ڀ;CDEGHLNPRSTV୕ୖ୪୼஡௫ఄ౞಄ದ೘ൡඅ櫬Āou୛୤ngruent;扢pCap;扭oubleVerticalBar;戦ƀlqxஃஊ஛ement;戉ualĀ;Tஒஓ扠ilde;쀀≂̸ists;戄reater΀;EFGLSTஶஷ஽௉௓௘௥扯qual;扱ullEqual;쀀≧̸reater;쀀≫̸ess;批lantEqual;쀀⩾̸ilde;扵umpń௲௽ownHump;쀀≎̸qual;쀀≏̸eĀfsఊధtTriangleƀ;BEచఛడ拪ar;쀀⧏̸qual;括s̀;EGLSTవశ఼ౄోౘ扮qual;扰reater;扸ess;쀀≪̸lantEqual;쀀⩽̸ilde;扴estedĀGL౨౹reaterGreater;쀀⪢̸essLess;쀀⪡̸recedesƀ;ESಒಓಛ技qual;쀀⪯̸lantEqual;拠ĀeiಫಹverseElement;戌ghtTriangleƀ;BEೋೌ೒拫ar;쀀⧐̸qual;拭ĀquೝഌuareSuĀbp೨೹setĀ;E೰ೳ쀀⊏̸qual;拢ersetĀ;Eഃആ쀀⊐̸qual;拣ƀbcpഓതൎsetĀ;Eഛഞ쀀⊂⃒qual;抈ceedsȀ;ESTലള഻െ抁qual;쀀⪰̸lantEqual;拡ilde;쀀≿̸ersetĀ;E൘൛쀀⊃⃒qual;抉ildeȀ;EFT൮൯൵ൿ扁qual;扄ullEqual;扇ilde;扉erticalBar;戤cr;쀀𝒩ilde耻Ñ䃑;䎝܀Eacdfgmoprstuvලෂ෉෕ෛ෠෧෼ขภยา฿ไlig;䅒cute耻Ó䃓Āiy෎ීrc耻Ô䃔;䐞blac;䅐r;쀀𝔒rave耻Ò䃒ƀaei෮ෲ෶cr;䅌ga;䎩cron;䎟pf;쀀𝕆enCurlyĀDQฎบoubleQuote;怜uote;怘;橔Āclวฬr;쀀𝒪ash耻Ø䃘iŬื฼de耻Õ䃕es;樷ml耻Ö䃖erĀBP๋๠Āar๐๓r;怾acĀek๚๜;揞et;掴arenthesis;揜Ҁacfhilors๿ງຊຏຒດຝະ໼rtialD;戂y;䐟r;쀀𝔓i;䎦;䎠usMinus;䂱Āipຢອncareplanåڝf;愙Ȁ;eio຺ູ໠໤檻cedesȀ;EST່້໏໚扺qual;檯lantEqual;扼ilde;找me;怳Ādp໩໮uct;戏ortionĀ;aȥ໹l;戝Āci༁༆r;쀀𝒫;䎨ȀUfos༑༖༛༟OT耻"䀢r;쀀𝔔pf;愚cr;쀀𝒬؀BEacefhiorsu༾གྷཇའཱིྦྷྪྭ႖ႩႴႾarr;椐G耻®䂮ƀcnrཎནབute;䅔g;柫rĀ;tཛྷཝ憠l;椖ƀaeyཧཬཱron;䅘dil;䅖;䐠Ā;vླྀཹ愜erseĀEUྂྙĀlq྇ྎement;戋uilibrium;
懋pEquilibrium;楯r»ཹo;䎡ghtЀACDFTUVa࿁࿫࿳ဢဨၛႇϘĀnr࿆࿒gleBracket;柩rowƀ;BL࿜࿝࿡憒ar;懥eftArrow;懄eiling;按oǵ࿹\0စbleBracket;柧nǔည\0နeeVector;楝ectorĀ;Bဝသ懂ar;楕loor;挋Āerိ၃eƀ;AVဵံြ抢rrow;憦ector;楛iangleƀ;BEၐၑၕ抳ar;槐qual;抵pƀDTVၣၮၸownVector;楏eeVector;楜ectorĀ;Bႂႃ憾ar;楔ectorĀ;B႑႒懀ar;楓Āpuႛ႞f;愝ndImplies;楰ightarrow;懛ĀchႹႼr;愛;憱leDelayed;槴ڀHOacfhimoqstuფჱჷჽᄙᄞᅑᅖᅡᅧᆵᆻᆿĀCcჩხHcy;䐩y;䐨FTcy;䐬cute;䅚ʀ;aeiyᄈᄉᄎᄓᄗ檼ron;䅠dil;䅞rc;䅜;䐡r;쀀𝔖ortȀDLRUᄪᄴᄾᅉownArrow»ОeftArrow»࢚ightArrow»࿝pArrow;憑gma;䎣allCircle;战pf;쀀𝕊ɲᅭ\0\0ᅰt;戚areȀ;ISUᅻᅼᆉᆯ斡ntersection;抓uĀbpᆏᆞsetĀ;Eᆗᆘ抏qual;抑ersetĀ;Eᆨᆩ抐qual;抒nion;抔cr;쀀𝒮ar;拆ȀbcmpᇈᇛሉላĀ;sᇍᇎ拐etĀ;Eᇍᇕqual;抆ĀchᇠህeedsȀ;ESTᇭᇮᇴᇿ扻qual;檰lantEqual;扽ilde;承Tháྌ;我ƀ;esሒሓሣ拑rsetĀ;Eሜም抃qual;抇et»ሓրHRSacfhiorsሾቄ቉ቕ቞ቱቶኟዂወዑORN耻Þ䃞ADE;愢ĀHc቎ቒcy;䐋y;䐦Ābuቚቜ;䀉;䎤ƀaeyብቪቯron;䅤dil;䅢;䐢r;쀀𝔗Āeiቻ኉ǲኀ\0ኇefore;戴a;䎘Ācn኎ኘkSpace;쀀  Space;怉ldeȀ;EFTካኬኲኼ戼qual;扃ullEqual;扅ilde;扈pf;쀀𝕋ipleDot;惛Āctዖዛr;쀀𝒯rok;䅦ૡዷጎጚጦ\0ጬጱ\0\0\0\0\0ጸጽ፷ᎅ\0᏿ᐄᐊᐐĀcrዻጁute耻Ú䃚rĀ;oጇገ憟cir;楉rǣጓ\0጖y;䐎ve;䅬Āiyጞጣrc耻Û䃛;䐣blac;䅰r;쀀𝔘rave耻Ù䃙acr;䅪Ādiፁ፩erĀBPፈ፝Āarፍፐr;䁟acĀekፗፙ;揟et;掵arenthesis;揝onĀ;P፰፱拃lus;抎Āgp፻፿on;䅲f;쀀𝕌ЀADETadps᎕ᎮᎸᏄϨᏒᏗᏳrrowƀ;BDᅐᎠᎤar;椒ownArrow;懅ownArrow;憕quilibrium;楮eeĀ;AᏋᏌ报rrow;憥ownáϳerĀLRᏞᏨeftArrow;憖ightArrow;憗iĀ;lᏹᏺ䏒on;䎥ing;䅮cr;쀀𝒰ilde;䅨ml耻Ü䃜ҀDbcdefosvᐧᐬᐰᐳᐾᒅᒊᒐᒖash;披ar;櫫y;䐒ashĀ;lᐻᐼ抩;櫦Āerᑃᑅ;拁ƀbtyᑌᑐᑺar;怖Ā;iᑏᑕcalȀBLSTᑡᑥᑪᑴar;戣ine;䁼eparator;杘ilde;所ThinSpace;怊r;쀀𝔙pf;쀀𝕍cr;쀀𝒱dash;抪ʀcefosᒧᒬᒱᒶᒼirc;䅴dge;拀r;쀀𝔚pf;쀀𝕎cr;쀀𝒲Ȁfiosᓋᓐᓒᓘr;쀀𝔛;䎞pf;쀀𝕏cr;쀀𝒳ҀAIUacfosuᓱᓵᓹᓽᔄᔏᔔᔚᔠcy;䐯cy;䐇cy;䐮cute耻Ý䃝Āiyᔉᔍrc;䅶;䐫r;쀀𝔜pf;쀀𝕐cr;쀀𝒴ml;䅸ЀHacdefosᔵᔹᔿᕋᕏᕝᕠᕤcy;䐖cute;䅹Āayᕄᕉron;䅽;䐗ot;䅻ǲᕔ\0ᕛoWidtè૙a;䎖r;愨pf;愤cr;쀀𝒵௡ᖃᖊᖐ\0ᖰᖶᖿ\0\0\0\0ᗆᗛᗫᙟ᙭\0ᚕ᚛ᚲᚹ\0ᚾcute耻á䃡reve;䄃̀;Ediuyᖜᖝᖡᖣᖨᖭ戾;쀀∾̳;房rc耻â䃢te肻´̆;䐰lig耻æ䃦Ā;r²ᖺ;쀀𝔞rave耻à䃠ĀepᗊᗖĀfpᗏᗔsym;愵èᗓha;䎱ĀapᗟcĀclᗤᗧr;䄁g;樿ɤᗰ\0\0ᘊʀ;adsvᗺᗻᗿᘁᘇ戧nd;橕;橜lope;橘;橚΀;elmrszᘘᘙᘛᘞᘿᙏᙙ戠;榤e»ᘙsdĀ;aᘥᘦ戡ѡᘰᘲᘴᘶᘸᘺᘼᘾ;榨;榩;榪;榫;榬;榭;榮;榯tĀ;vᙅᙆ戟bĀ;dᙌᙍ抾;榝Āptᙔᙗh;戢»¹arr;捼Āgpᙣᙧon;䄅f;쀀𝕒΀;Eaeiop዁ᙻᙽᚂᚄᚇᚊ;橰cir;橯;扊d;手s;䀧roxĀ;e዁ᚒñᚃing耻å䃥ƀctyᚡᚦᚨr;쀀𝒶;䀪mpĀ;e዁ᚯñʈilde耻ã䃣ml耻ä䃤Āciᛂᛈoninôɲnt;樑ࠀNabcdefiklnoprsu᛭ᛱᜰ᜼ᝃᝈ᝸᝽០៦ᠹᡐᜍ᤽᥈ᥰot;櫭Ācrᛶ᜞kȀcepsᜀᜅᜍᜓong;扌psilon;䏶rime;怵imĀ;e᜚᜛戽q;拍Ŷᜢᜦee;抽edĀ;gᜬᜭ挅e»ᜭrkĀ;t፜᜷brk;掶Āoyᜁᝁ;䐱quo;怞ʀcmprtᝓ᝛ᝡᝤᝨausĀ;eĊĉptyv;榰séᜌnoõēƀa
hwᝯ᝱ᝳ;䎲;愶een;扬r;쀀𝔟g΀costuvwឍឝឳេ៕៛៞ƀaiuបពរðݠrc;旯p»፱ƀdptឤឨឭot;樀lus;樁imes;樂ɱឹ\0\0ើcup;樆ar;昅riangleĀdu៍្own;施p;斳plus;樄eåᑄåᒭarow;植ƀako៭ᠦᠵĀcn៲ᠣkƀlst៺֫᠂ozenge;槫riangleȀ;dlr᠒᠓᠘᠝斴own;斾eft;旂ight;斸k;搣Ʊᠫ\0ᠳƲᠯ\0ᠱ;斒;斑4;斓ck;斈ĀeoᠾᡍĀ;qᡃᡆ쀀=⃥uiv;쀀≡⃥t;挐Ȁptwxᡙᡞᡧᡬf;쀀𝕓Ā;tᏋᡣom»Ꮜtie;拈؀DHUVbdhmptuvᢅᢖᢪᢻᣗᣛᣬ᣿ᤅᤊᤐᤡȀLRlrᢎᢐᢒᢔ;敗;敔;敖;敓ʀ;DUduᢡᢢᢤᢦᢨ敐;敦;敩;敤;敧ȀLRlrᢳᢵᢷᢹ;敝;敚;敜;教΀;HLRhlrᣊᣋᣍᣏᣑᣓᣕ救;敬;散;敠;敫;敢;敟ox;槉ȀLRlrᣤᣦᣨᣪ;敕;敒;攐;攌ʀ;DUduڽ᣷᣹᣻᣽;敥;敨;攬;攴inus;抟lus;択imes;抠ȀLRlrᤙᤛᤝ᤟;敛;敘;攘;攔΀;HLRhlrᤰᤱᤳᤵᤷ᤻᤹攂;敪;敡;敞;攼;攤;攜Āevģ᥂bar耻¦䂦Ȁceioᥑᥖᥚᥠr;쀀𝒷mi;恏mĀ;e᜚᜜lƀ;bhᥨᥩᥫ䁜;槅sub;柈Ŭᥴ᥾lĀ;e᥹᥺怢t»᥺pƀ;Eeįᦅᦇ;檮Ā;qۜۛೡᦧ\0᧨ᨑᨕᨲ\0ᨷᩐ\0\0᪴\0\0᫁\0\0ᬡᬮ᭍᭒\0᯽\0ᰌƀcpr᦭ᦲ᧝ute;䄇̀;abcdsᦿᧀᧄ᧊᧕᧙戩nd;橄rcup;橉Āau᧏᧒p;橋p;橇ot;橀;쀀∩︀Āeo᧢᧥t;恁îړȀaeiu᧰᧻ᨁᨅǰ᧵\0᧸s;橍on;䄍dil耻ç䃧rc;䄉psĀ;sᨌᨍ橌m;橐ot;䄋ƀdmnᨛᨠᨦil肻¸ƭptyv;榲t脀¢;eᨭᨮ䂢räƲr;쀀𝔠ƀceiᨽᩀᩍy;䑇ckĀ;mᩇᩈ朓ark»ᩈ;䏇r΀;Ecefms᩟᩠ᩢᩫ᪤᪪᪮旋;槃ƀ;elᩩᩪᩭ䋆q;扗eɡᩴ\0\0᪈rrowĀlr᩼᪁eft;憺ight;憻ʀRSacd᪒᪔᪖᪚᪟»ཇ;擈st;抛irc;抚ash;抝nint;樐id;櫯cir;槂ubsĀ;u᪻᪼晣it»᪼ˬ᫇᫔᫺\0ᬊonĀ;eᫍᫎ䀺Ā;qÇÆɭ᫙\0\0᫢aĀ;t᫞᫟䀬;䁀ƀ;fl᫨᫩᫫戁îᅠeĀmx᫱᫶ent»᫩eóɍǧ᫾\0ᬇĀ;dኻᬂot;橭nôɆƀfryᬐᬔᬗ;쀀𝕔oäɔ脀©;sŕᬝr;愗Āaoᬥᬩrr;憵ss;朗Ācuᬲᬷr;쀀𝒸Ābpᬼ᭄Ā;eᭁᭂ櫏;櫑Ā;eᭉᭊ櫐;櫒dot;拯΀delprvw᭠᭬᭷ᮂᮬᯔ᯹arrĀlr᭨᭪;椸;椵ɰ᭲\0\0᭵r;拞c;拟arrĀ;p᭿ᮀ憶;椽̀;bcdosᮏᮐᮖᮡᮥᮨ截rcap;橈Āauᮛᮞp;橆p;橊ot;抍r;橅;쀀∪︀Ȁalrv᮵ᮿᯞᯣrrĀ;mᮼᮽ憷;椼yƀevwᯇᯔᯘqɰᯎ\0\0ᯒreã᭳uã᭵ee;拎edge;拏en耻¤䂤earrowĀlrᯮ᯳eft»ᮀight»ᮽeäᯝĀciᰁᰇoninôǷnt;戱lcty;挭ঀAHabcdefhijlorstuwz᰸᰻᰿ᱝᱩᱵᲊᲞᲬᲷ᳻᳿ᴍᵻᶑᶫᶻ᷆᷍rò΁ar;楥Ȁglrs᱈ᱍ᱒᱔ger;怠eth;愸òᄳhĀ;vᱚᱛ怐»ऊūᱡᱧarow;椏aã̕Āayᱮᱳron;䄏;䐴ƀ;ao̲ᱼᲄĀgrʿᲁr;懊tseq;橷ƀglmᲑᲔᲘ耻°䂰ta;䎴ptyv;榱ĀirᲣᲨsht;楿;쀀𝔡arĀlrᲳᲵ»ࣜ»သʀaegsv᳂͸᳖᳜᳠mƀ;oș᳊᳔ndĀ;ș᳑uit;晦amma;䏝in;拲ƀ;io᳧᳨᳸䃷de脀÷;o᳧ᳰntimes;拇nø᳷cy;䑒cɯᴆ\0\0ᴊrn;挞op;挍ʀlptuwᴘᴝᴢᵉᵕlar;䀤f;쀀𝕕ʀ;emps̋ᴭᴷᴽᵂqĀ;d͒ᴳot;扑inus;戸lus;戔quare;抡blebarwedgåúnƀadhᄮᵝᵧownarrowóᲃarpoonĀlrᵲᵶefôᲴighôᲶŢᵿᶅkaro÷གɯᶊ\0\0ᶎrn;挟op;挌ƀcotᶘᶣᶦĀryᶝᶡ;쀀𝒹;䑕l;槶rok;䄑Ādrᶰᶴot;拱iĀ;fᶺ᠖斿Āah᷀᷃ròЩaòྦangle;榦Āci᷒ᷕy;䑟grarr;柿ऀDacdefglmnopqrstuxḁḉḙḸոḼṉṡṾấắẽỡἪἷὄ὎὚ĀDoḆᴴoôᲉĀcsḎḔute耻é䃩ter;橮ȀaioyḢḧḱḶron;䄛rĀ;cḭḮ扖耻ê䃪lon;払;䑍ot;䄗ĀDrṁṅot;扒;쀀𝔢ƀ;rsṐṑṗ檚ave耻è䃨Ā;dṜṝ檖ot;檘Ȁ;ilsṪṫṲṴ檙nters;揧;愓Ā;dṹṺ檕ot;檗ƀapsẅẉẗcr;䄓tyƀ;svẒẓẕ戅et»ẓpĀ1;ẝẤĳạả;怄;怅怃ĀgsẪẬ;䅋p;怂ĀgpẴẸon;䄙f;쀀𝕖ƀalsỄỎỒrĀ;sỊị拕l;槣us;橱iƀ;lvỚớở䎵on»ớ;䏵ȀcsuvỪỳἋἣĀioữḱrc»Ḯɩỹ\0\0ỻíՈantĀglἂἆtr»ṝess»Ṻƀaeiἒ἖Ἒls;䀽st;扟vĀ;DȵἠD;橸par
sl;槥ĀDaἯἳot;打rr;楱ƀcdiἾὁỸr;愯oô͒ĀahὉὋ;䎷耻ð䃰Āmrὓὗl耻ë䃫o;悬ƀcipὡὤὧl;䀡sôծĀeoὬὴctatioîՙnentialåչৡᾒ\0ᾞ\0ᾡᾧ\0\0ῆῌ\0ΐ\0ῦῪ \0 ⁚llingdotseñṄy;䑄male;晀ƀilrᾭᾳ῁lig;耀ﬃɩᾹ\0\0᾽g;耀ﬀig;耀ﬄ;쀀𝔣lig;耀ﬁlig;쀀fjƀaltῙ῜ῡt;晭ig;耀ﬂns;斱of;䆒ǰ΅\0ῳf;쀀𝕗ĀakֿῷĀ;vῼ´拔;櫙artint;樍Āao‌⁕Ācs‑⁒α‚‰‸⁅⁈\0⁐β•‥‧‪‬\0‮耻½䂽;慓耻¼䂼;慕;慙;慛Ƴ‴\0‶;慔;慖ʴ‾⁁\0\0⁃耻¾䂾;慗;慜5;慘ƶ⁌\0⁎;慚;慝8;慞l;恄wn;挢cr;쀀𝒻ࢀEabcdefgijlnorstv₂₉₟₥₰₴⃰⃵⃺⃿℃ℒℸ̗ℾ⅒↞Ā;lٍ₇;檌ƀcmpₐₕ₝ute;䇵maĀ;dₜ᳚䎳;檆reve;䄟Āiy₪₮rc;䄝;䐳ot;䄡Ȁ;lqsؾق₽⃉ƀ;qsؾٌ⃄lanô٥Ȁ;cdl٥⃒⃥⃕c;檩otĀ;o⃜⃝檀Ā;l⃢⃣檂;檄Ā;e⃪⃭쀀⋛︀s;檔r;쀀𝔤Ā;gٳ؛mel;愷cy;䑓Ȁ;Eajٚℌℎℐ;檒;檥;檤ȀEaesℛℝ℩ℴ;扩pĀ;p℣ℤ檊rox»ℤĀ;q℮ℯ檈Ā;q℮ℛim;拧pf;쀀𝕘Āci⅃ⅆr;愊mƀ;el٫ⅎ⅐;檎;檐茀>;cdlqr׮ⅠⅪⅮⅳⅹĀciⅥⅧ;檧r;橺ot;拗Par;榕uest;橼ʀadelsↄⅪ←ٖ↛ǰ↉\0↎proø₞r;楸qĀlqؿ↖lesó₈ií٫Āen↣↭rtneqq;쀀≩︀Å↪ԀAabcefkosy⇄⇇⇱⇵⇺∘∝∯≨≽ròΠȀilmr⇐⇔⇗⇛rsðᒄf»․ilôکĀdr⇠⇤cy;䑊ƀ;cwࣴ⇫⇯ir;楈;憭ar;意irc;䄥ƀalr∁∎∓rtsĀ;u∉∊晥it»∊lip;怦con;抹r;쀀𝔥sĀew∣∩arow;椥arow;椦ʀamopr∺∾≃≞≣rr;懿tht;戻kĀlr≉≓eftarrow;憩ightarrow;憪f;쀀𝕙bar;怕ƀclt≯≴≸r;쀀𝒽asè⇴rok;䄧Ābp⊂⊇ull;恃hen»ᱛૡ⊣\0⊪\0⊸⋅⋎\0⋕⋳\0\0⋸⌢⍧⍢⍿\0⎆⎪⎴cute耻í䃭ƀ;iyݱ⊰⊵rc耻î䃮;䐸Ācx⊼⊿y;䐵cl耻¡䂡ĀfrΟ⋉;쀀𝔦rave耻ì䃬Ȁ;inoܾ⋝⋩⋮Āin⋢⋦nt;樌t;戭fin;槜ta;愩lig;䄳ƀaop⋾⌚⌝ƀcgt⌅⌈⌗r;䄫ƀelpܟ⌏⌓inåގarôܠh;䄱f;抷ed;䆵ʀ;cfotӴ⌬⌱⌽⍁are;愅inĀ;t⌸⌹戞ie;槝doô⌙ʀ;celpݗ⍌⍐⍛⍡al;抺Āgr⍕⍙eróᕣã⍍arhk;樗rod;樼Ȁcgpt⍯⍲⍶⍻y;䑑on;䄯f;쀀𝕚a;䎹uest耻¿䂿Āci⎊⎏r;쀀𝒾nʀ;EdsvӴ⎛⎝⎡ӳ;拹ot;拵Ā;v⎦⎧拴;拳Ā;iݷ⎮lde;䄩ǫ⎸\0⎼cy;䑖l耻ï䃯̀cfmosu⏌⏗⏜⏡⏧⏵Āiy⏑⏕rc;䄵;䐹r;쀀𝔧ath;䈷pf;쀀𝕛ǣ⏬\0⏱r;쀀𝒿rcy;䑘kcy;䑔Ѐacfghjos␋␖␢␧␭␱␵␻ppaĀ;v␓␔䎺;䏰Āey␛␠dil;䄷;䐺r;쀀𝔨reen;䄸cy;䑅cy;䑜pf;쀀𝕜cr;쀀𝓀஀ABEHabcdefghjlmnoprstuv⑰⒁⒆⒍⒑┎┽╚▀♎♞♥♹♽⚚⚲⛘❝❨➋⟀⠁⠒ƀart⑷⑺⑼rò৆òΕail;椛arr;椎Ā;gঔ⒋;檋ar;楢ॣ⒥\0⒪\0⒱\0\0\0\0\0⒵Ⓔ\0ⓆⓈⓍ\0⓹ute;䄺mptyv;榴raîࡌbda;䎻gƀ;dlࢎⓁⓃ;榑åࢎ;檅uo耻«䂫rЀ;bfhlpst࢙ⓞⓦⓩ⓫⓮⓱⓵Ā;f࢝ⓣs;椟s;椝ë≒p;憫l;椹im;楳l;憢ƀ;ae⓿─┄檫il;椙Ā;s┉┊檭;쀀⪭︀ƀabr┕┙┝rr;椌rk;杲Āak┢┬cĀek┨┪;䁻;䁛Āes┱┳;榋lĀdu┹┻;榏;榍Ȁaeuy╆╋╖╘ron;䄾Ādi═╔il;䄼ìࢰâ┩;䐻Ȁcqrs╣╦╭╽a;椶uoĀ;rนᝆĀdu╲╷har;楧shar;楋h;憲ʀ;fgqs▋▌উ◳◿扤tʀahlrt▘▤▷◂◨rrowĀ;t࢙□aé⓶arpoonĀdu▯▴own»њp»०eftarrows;懇ightƀahs◍◖◞rrowĀ;sࣴࢧarpoonó྘quigarro÷⇰hreetimes;拋ƀ;qs▋ও◺lanôবʀ;cdgsব☊☍☝☨c;檨otĀ;o☔☕橿Ā;r☚☛檁;檃Ā;e☢☥쀀⋚︀s;檓ʀadegs☳☹☽♉♋pproøⓆot;拖qĀgq♃♅ôউgtò⒌ôছiíলƀilr♕࣡♚sht;楼;쀀𝔩Ā;Eজ♣;檑š♩♶rĀdu▲♮Ā;l॥♳;楪lk;斄cy;䑙ʀ;achtੈ⚈⚋⚑⚖rò◁orneòᴈard;楫ri;旺Āio⚟⚤dot;䅀ustĀ;a⚬⚭掰che»⚭ȀEaes⚻⚽⛉⛔;扨pĀ;p⛃⛄檉rox»⛄Ā;q⛎⛏檇Ā;q⛎⚻im;拦Ѐabnoptwz⛩⛴⛷✚✯
❁❇❐Ānr⛮⛱g;柬r;懽rëࣁgƀlmr⛿✍✔eftĀar০✇ightá৲apsto;柼ightá৽parrowĀlr✥✩efô⓭ight;憬ƀafl✶✹✽r;榅;쀀𝕝us;樭imes;樴š❋❏st;戗áፎƀ;ef❗❘᠀旊nge»❘arĀ;l❤❥䀨t;榓ʀachmt❳❶❼➅➇ròࢨorneòᶌarĀ;d྘➃;業;怎ri;抿̀achiqt➘➝ੀ➢➮➻quo;怹r;쀀𝓁mƀ;egল➪➬;檍;檏Ābu┪➳oĀ;rฟ➹;怚rok;䅂萀<;cdhilqrࠫ⟒☹⟜⟠⟥⟪⟰Āci⟗⟙;檦r;橹reå◲mes;拉arr;楶uest;橻ĀPi⟵⟹ar;榖ƀ;ef⠀भ᠛旃rĀdu⠇⠍shar;楊har;楦Āen⠗⠡rtneqq;쀀≨︀Å⠞܀Dacdefhilnopsu⡀⡅⢂⢎⢓⢠⢥⢨⣚⣢⣤ઃ⣳⤂Dot;戺Ȁclpr⡎⡒⡣⡽r耻¯䂯Āet⡗⡙;時Ā;e⡞⡟朠se»⡟Ā;sျ⡨toȀ;dluျ⡳⡷⡻owîҌefôएðᏑker;斮Āoy⢇⢌mma;権;䐼ash;怔asuredangle»ᘦr;쀀𝔪o;愧ƀcdn⢯⢴⣉ro耻µ䂵Ȁ;acdᑤ⢽⣀⣄sôᚧir;櫰ot肻·Ƶusƀ;bd⣒ᤃ⣓戒Ā;uᴼ⣘;横ţ⣞⣡p;櫛ò−ðઁĀdp⣩⣮els;抧f;쀀𝕞Āct⣸⣽r;쀀𝓂pos»ᖝƀ;lm⤉⤊⤍䎼timap;抸ఀGLRVabcdefghijlmoprstuvw⥂⥓⥾⦉⦘⧚⧩⨕⨚⩘⩝⪃⪕⪤⪨⬄⬇⭄⭿⮮ⰴⱧⱼ⳩Āgt⥇⥋;쀀⋙̸Ā;v⥐௏쀀≫⃒ƀelt⥚⥲⥶ftĀar⥡⥧rrow;懍ightarrow;懎;쀀⋘̸Ā;v⥻ే쀀≪⃒ightarrow;懏ĀDd⦎⦓ash;抯ash;抮ʀbcnpt⦣⦧⦬⦱⧌la»˞ute;䅄g;쀀∠⃒ʀ;Eiop඄⦼⧀⧅⧈;쀀⩰̸d;쀀≋̸s;䅉roø඄urĀ;a⧓⧔普lĀ;s⧓ସǳ⧟\0⧣p肻 ଷmpĀ;e௹ఀʀaeouy⧴⧾⨃⨐⨓ǰ⧹\0⧻;橃on;䅈dil;䅆ngĀ;dൾ⨊ot;쀀⩭̸p;橂;䐽ash;怓΀;Aadqsxஒ⨩⨭⨻⩁⩅⩐rr;懗rĀhr⨳⨶k;椤Ā;oᏲᏰot;쀀≐̸uiöୣĀei⩊⩎ar;椨í஘istĀ;s஠டr;쀀𝔫ȀEest௅⩦⩹⩼ƀ;qs஼⩭௡ƀ;qs஼௅⩴lanô௢ií௪Ā;rஶ⪁»ஷƀAap⪊⪍⪑rò⥱rr;憮ar;櫲ƀ;svྍ⪜ྌĀ;d⪡⪢拼;拺cy;䑚΀AEadest⪷⪺⪾⫂⫅⫶⫹rò⥦;쀀≦̸rr;憚r;急Ȁ;fqs఻⫎⫣⫯tĀar⫔⫙rro÷⫁ightarro÷⪐ƀ;qs఻⪺⫪lanôౕĀ;sౕ⫴»శiíౝĀ;rవ⫾iĀ;eచథiäඐĀpt⬌⬑f;쀀𝕟膀¬;in⬙⬚⬶䂬nȀ;Edvஉ⬤⬨⬮;쀀⋹̸ot;쀀⋵̸ǡஉ⬳⬵;拷;拶iĀ;vಸ⬼ǡಸ⭁⭃;拾;拽ƀaor⭋⭣⭩rȀ;ast୻⭕⭚⭟lleì୻l;쀀⫽⃥;쀀∂̸lint;樔ƀ;ceಒ⭰⭳uåಥĀ;cಘ⭸Ā;eಒ⭽ñಘȀAait⮈⮋⮝⮧rò⦈rrƀ;cw⮔⮕⮙憛;쀀⤳̸;쀀↝̸ghtarrow»⮕riĀ;eೋೖ΀chimpqu⮽⯍⯙⬄୸⯤⯯Ȁ;cerല⯆ഷ⯉uå൅;쀀𝓃ortɭ⬅\0\0⯖ará⭖mĀ;e൮⯟Ā;q൴൳suĀbp⯫⯭å೸åഋƀbcp⯶ⰑⰙȀ;Ees⯿ⰀഢⰄ抄;쀀⫅̸etĀ;eഛⰋqĀ;qണⰀcĀ;eലⰗñസȀ;EesⰢⰣൟⰧ抅;쀀⫆̸etĀ;e൘ⰮqĀ;qൠⰣȀgilrⰽⰿⱅⱇìௗlde耻ñ䃱çృiangleĀlrⱒⱜeftĀ;eచⱚñదightĀ;eೋⱥñ೗Ā;mⱬⱭ䎽ƀ;esⱴⱵⱹ䀣ro;愖p;怇ҀDHadgilrsⲏⲔⲙⲞⲣⲰⲶⳓⳣash;抭arr;椄p;쀀≍⃒ash;抬ĀetⲨⲬ;쀀≥⃒;쀀>⃒nfin;槞ƀAetⲽⳁⳅrr;椂;쀀≤⃒Ā;rⳊⳍ쀀<⃒ie;쀀⊴⃒ĀAtⳘⳜrr;椃rie;쀀⊵⃒im;쀀∼⃒ƀAan⳰⳴ⴂrr;懖rĀhr⳺⳽k;椣Ā;oᏧᏥear;椧ቓ᪕\0\0\0\0\0\0\0\0\0\0\0\0\0ⴭ\0ⴸⵈⵠⵥ⵲ⶄᬇ\0\0ⶍⶫ\0ⷈⷎ\0ⷜ⸙⸫⸾⹃Ācsⴱ᪗ute耻ó䃳ĀiyⴼⵅrĀ;c᪞ⵂ耻ô䃴;䐾ʀabios᪠ⵒⵗǈⵚlac;䅑v;樸old;榼lig;䅓Ācr⵩⵭ir;榿;쀀𝔬ͯ⵹\0\0⵼\0ⶂn;䋛ave耻ò䃲;槁Ābmⶈ෴ar;榵Ȁacitⶕ⶘ⶥⶨrò᪀Āir⶝ⶠr;榾oss;榻nå๒;槀ƀaeiⶱⶵⶹcr;䅍ga;䏉ƀcdnⷀⷅǍron;䎿;榶pf;쀀𝕠ƀaelⷔ⷗ǒr;榷rp;榹΀;adiosvⷪⷫⷮ⸈⸍⸐⸖戨rò᪆Ȁ;efmⷷⷸ⸂⸅橝rĀ;oⷾⷿ愴f»ⷿ耻ª䂪耻º䂺gof;抶r;橖lope;橗;橛ƀclo⸟⸡⸧ò⸁ash耻ø䃸l;折iŬⸯ⸴de耻õ䃵esĀ;aǛ⸺s;樶ml耻ö䃶bar;挽ૡ⹞\0⹽\0⺀⺝\0⺢⺹\0\0⻋ຜ\0⼓\0\0⼫⾼\0⿈rȀ;astЃ⹧⹲຅脀¶;l⹭⹮䂶leìЃɩ⹸\0\0⹻m;櫳;櫽y;䐿rʀcimpt⺋⺏⺓ᡥ⺗nt;䀥od;䀮il;怰enk;怱
r;쀀𝔭ƀimo⺨⺰⺴Ā;v⺭⺮䏆;䏕maô੶ne;明ƀ;tv⺿⻀⻈䏀chfork»´;䏖Āau⻏⻟nĀck⻕⻝kĀ;h⇴⻛;愎ö⇴sҀ;abcdemst⻳⻴ᤈ⻹⻽⼄⼆⼊⼎䀫cir;樣ir;樢Āouᵀ⼂;樥;橲n肻±ຝim;樦wo;樧ƀipu⼙⼠⼥ntint;樕f;쀀𝕡nd耻£䂣Ԁ;Eaceinosu່⼿⽁⽄⽇⾁⾉⾒⽾⾶;檳p;檷uå໙Ā;c໎⽌̀;acens່⽙⽟⽦⽨⽾pproø⽃urlyeñ໙ñ໎ƀaes⽯⽶⽺pprox;檹qq;檵im;拨iíໟmeĀ;s⾈ຮ怲ƀEas⽸⾐⽺ð⽵ƀdfp໬⾙⾯ƀals⾠⾥⾪lar;挮ine;挒urf;挓Ā;t໻⾴ï໻rel;抰Āci⿀⿅r;쀀𝓅;䏈ncsp;怈̀fiopsu⿚⋢⿟⿥⿫⿱r;쀀𝔮pf;쀀𝕢rime;恗cr;쀀𝓆ƀaeo⿸〉〓tĀei⿾々rnionóڰnt;樖stĀ;e【】䀿ñἙô༔઀ABHabcdefhilmnoprstux぀けさすムㄎㄫㅇㅢㅲㆎ㈆㈕㈤㈩㉘㉮㉲㊐㊰㊷ƀartぇおがròႳòϝail;検aròᱥar;楤΀cdenqrtとふへみわゔヌĀeuねぱ;쀀∽̱te;䅕iãᅮmptyv;榳gȀ;del࿑らるろ;榒;榥å࿑uo耻»䂻rր;abcfhlpstw࿜ガクシスゼゾダッデナp;極Ā;f࿠ゴs;椠;椳s;椞ë≝ð✮l;楅im;楴l;憣;憝Āaiパフil;椚oĀ;nホボ戶aló༞ƀabrョリヮrò៥rk;杳ĀakンヽcĀekヹ・;䁽;䁝Āes㄂㄄;榌lĀduㄊㄌ;榎;榐Ȁaeuyㄗㄜㄧㄩron;䅙Ādiㄡㄥil;䅗ì࿲âヺ;䑀Ȁclqsㄴㄷㄽㅄa;椷dhar;楩uoĀ;rȎȍh;憳ƀacgㅎㅟངlȀ;ipsླྀㅘㅛႜnåႻarôྩt;断ƀilrㅩဣㅮsht;楽;쀀𝔯ĀaoㅷㆆrĀduㅽㅿ»ѻĀ;l႑ㆄ;楬Ā;vㆋㆌ䏁;䏱ƀgns㆕ㇹㇼht̀ahlrstㆤㆰ㇂㇘㇤㇮rrowĀ;t࿜ㆭaéトarpoonĀduㆻㆿowîㅾp»႒eftĀah㇊㇐rrowó࿪arpoonóՑightarrows;應quigarro÷ニhreetimes;拌g;䋚ingdotseñἲƀahm㈍㈐㈓rò࿪aòՑ;怏oustĀ;a㈞㈟掱che»㈟mid;櫮Ȁabpt㈲㈽㉀㉒Ānr㈷㈺g;柭r;懾rëဃƀafl㉇㉊㉎r;榆;쀀𝕣us;樮imes;樵Āap㉝㉧rĀ;g㉣㉤䀩t;榔olint;樒arò㇣Ȁachq㉻㊀Ⴜ㊅quo;怺r;쀀𝓇Ābu・㊊oĀ;rȔȓƀhir㊗㊛㊠reåㇸmes;拊iȀ;efl㊪ၙᠡ㊫方tri;槎luhar;楨;愞ൡ㋕㋛㋟㌬㌸㍱\0㍺㎤\0\0㏬㏰\0㐨㑈㑚㒭㒱㓊㓱\0㘖\0\0㘳cute;䅛quï➺Ԁ;Eaceinpsyᇭ㋳㋵㋿㌂㌋㌏㌟㌦㌩;檴ǰ㋺\0㋼;檸on;䅡uåᇾĀ;dᇳ㌇il;䅟rc;䅝ƀEas㌖㌘㌛;檶p;檺im;择olint;樓iíሄ;䑁otƀ;be㌴ᵇ㌵担;橦΀Aacmstx㍆㍊㍗㍛㍞㍣㍭rr;懘rĀhr㍐㍒ë∨Ā;oਸ਼਴t耻§䂧i;䀻war;椩mĀin㍩ðnuóñt;朶rĀ;o㍶⁕쀀𝔰Ȁacoy㎂㎆㎑㎠rp;景Āhy㎋㎏cy;䑉;䑈rtɭ㎙\0\0㎜iäᑤaraì⹯耻­䂭Āgm㎨㎴maƀ;fv㎱㎲㎲䏃;䏂Ѐ;deglnprካ㏅㏉㏎㏖㏞㏡㏦ot;橪Ā;q኱ኰĀ;E㏓㏔檞;檠Ā;E㏛㏜檝;檟e;扆lus;樤arr;楲aròᄽȀaeit㏸㐈㐏㐗Āls㏽㐄lsetmé㍪hp;樳parsl;槤Ādlᑣ㐔e;挣Ā;e㐜㐝檪Ā;s㐢㐣檬;쀀⪬︀ƀflp㐮㐳㑂tcy;䑌Ā;b㐸㐹䀯Ā;a㐾㐿槄r;挿f;쀀𝕤aĀdr㑍ЂesĀ;u㑔㑕晠it»㑕ƀcsu㑠㑹㒟Āau㑥㑯pĀ;sᆈ㑫;쀀⊓︀pĀ;sᆴ㑵;쀀⊔︀uĀbp㑿㒏ƀ;esᆗᆜ㒆etĀ;eᆗ㒍ñᆝƀ;esᆨᆭ㒖etĀ;eᆨ㒝ñᆮƀ;afᅻ㒦ְrť㒫ֱ»ᅼaròᅈȀcemt㒹㒾㓂㓅r;쀀𝓈tmîñiì㐕aræᆾĀar㓎㓕rĀ;f㓔ឿ昆Āan㓚㓭ightĀep㓣㓪psiloîỠhé⺯s»⡒ʀbcmnp㓻㕞ሉ㖋㖎Ҁ;Edemnprs㔎㔏㔑㔕㔞㔣㔬㔱㔶抂;櫅ot;檽Ā;dᇚ㔚ot;櫃ult;櫁ĀEe㔨㔪;櫋;把lus;檿arr;楹ƀeiu㔽㕒㕕tƀ;en㔎㕅㕋qĀ;qᇚ㔏eqĀ;q㔫㔨m;櫇Ābp㕚㕜;櫕;櫓c̀;acensᇭ㕬㕲㕹㕻㌦pproø㋺urlyeñᇾñᇳƀaes㖂㖈㌛pproø㌚qñ㌗g;晪ڀ123;Edehlmnps㖩㖬㖯ሜ㖲㖴㗀㗉㗕㗚㗟㗨㗭耻¹䂹耻²䂲耻³䂳;櫆Āos㖹㖼t;檾ub;櫘Ā;dሢ㗅ot;櫄sĀou㗏㗒l;柉b;櫗arr;楻ult;櫂ĀEe㗤㗦;櫌;抋lus;櫀ƀeiu㗴㘉㘌tƀ;enሜ㗼㘂qĀ;qሢ㖲eqĀ;q㗧㗤m;櫈Ābp㘑㘓;櫔;櫖ƀAan㘜㘠㘭rr;懙rĀhr㘦㘨ë∮Ā;oਫ਩war;椪lig耻ß䃟௡㙑㙝㙠ዎ㙳㙹\0㙾㛂\0\0\0\0\0㛛㜃\0㜉㝬\0\0\0㞇ɲ㙖\0\0㙛get;挖;䏄rë๟ƀae
y㙦㙫㙰ron;䅥dil;䅣;䑂lrec;挕r;쀀𝔱Ȁeiko㚆㚝㚵㚼ǲ㚋\0㚑eĀ4fኄኁaƀ;sv㚘㚙㚛䎸ym;䏑Ācn㚢㚲kĀas㚨㚮pproø዁im»ኬsðኞĀas㚺㚮ð዁rn耻þ䃾Ǭ̟㛆⋧es膀×;bd㛏㛐㛘䃗Ā;aᤏ㛕r;樱;樰ƀeps㛡㛣㜀á⩍Ȁ;bcf҆㛬㛰㛴ot;挶ir;櫱Ā;o㛹㛼쀀𝕥rk;櫚á㍢rime;怴ƀaip㜏㜒㝤dåቈ΀adempst㜡㝍㝀㝑㝗㝜㝟ngleʀ;dlqr㜰㜱㜶㝀㝂斵own»ᶻeftĀ;e⠀㜾ñम;扜ightĀ;e㊪㝋ñၚot;旬inus;樺lus;樹b;槍ime;樻ezium;揢ƀcht㝲㝽㞁Āry㝷㝻;쀀𝓉;䑆cy;䑛rok;䅧Āio㞋㞎xô᝷headĀlr㞗㞠eftarro÷ࡏightarrow»ཝऀAHabcdfghlmoprstuw㟐㟓㟗㟤㟰㟼㠎㠜㠣㠴㡑㡝㡫㢩㣌㣒㣪㣶ròϭar;楣Ācr㟜㟢ute耻ú䃺òᅐrǣ㟪\0㟭y;䑞ve;䅭Āiy㟵㟺rc耻û䃻;䑃ƀabh㠃㠆㠋ròᎭlac;䅱aòᏃĀir㠓㠘sht;楾;쀀𝔲rave耻ù䃹š㠧㠱rĀlr㠬㠮»ॗ»ႃlk;斀Āct㠹㡍ɯ㠿\0\0㡊rnĀ;e㡅㡆挜r»㡆op;挏ri;旸Āal㡖㡚cr;䅫肻¨͉Āgp㡢㡦on;䅳f;쀀𝕦̀adhlsuᅋ㡸㡽፲㢑㢠ownáᎳarpoonĀlr㢈㢌efô㠭ighô㠯iƀ;hl㢙㢚㢜䏅»ᏺon»㢚parrows;懈ƀcit㢰㣄㣈ɯ㢶\0\0㣁rnĀ;e㢼㢽挝r»㢽op;挎ng;䅯ri;旹cr;쀀𝓊ƀdir㣙㣝㣢ot;拰lde;䅩iĀ;f㜰㣨»᠓Āam㣯㣲rò㢨l耻ü䃼angle;榧ހABDacdeflnoprsz㤜㤟㤩㤭㦵㦸㦽㧟㧤㧨㧳㧹㧽㨁㨠ròϷarĀ;v㤦㤧櫨;櫩asèϡĀnr㤲㤷grt;榜΀eknprst㓣㥆㥋㥒㥝㥤㦖appá␕othinçẖƀhir㓫⻈㥙opô⾵Ā;hᎷ㥢ïㆍĀiu㥩㥭gmá㎳Ābp㥲㦄setneqĀ;q㥽㦀쀀⊊︀;쀀⫋︀setneqĀ;q㦏㦒쀀⊋︀;쀀⫌︀Āhr㦛㦟etá㚜iangleĀlr㦪㦯eft»थight»ၑy;䐲ash»ံƀelr㧄㧒㧗ƀ;beⷪ㧋㧏ar;抻q;扚lip;拮Ābt㧜ᑨaòᑩr;쀀𝔳tré㦮suĀbp㧯㧱»ജ»൙pf;쀀𝕧roð໻tré㦴Ācu㨆㨋r;쀀𝓋Ābp㨐㨘nĀEe㦀㨖»㥾nĀEe㦒㨞»㦐igzag;榚΀cefoprs㨶㨻㩖㩛㩔㩡㩪irc;䅵Ādi㩀㩑Ābg㩅㩉ar;機eĀ;qᗺ㩏;扙erp;愘r;쀀𝔴pf;쀀𝕨Ā;eᑹ㩦atèᑹcr;쀀𝓌ૣណ㪇\0㪋\0㪐㪛\0\0㪝㪨㪫㪯\0\0㫃㫎\0㫘ៜ៟tré៑r;쀀𝔵ĀAa㪔㪗ròσrò৶;䎾ĀAa㪡㪤ròθrò৫að✓is;拻ƀdptឤ㪵㪾Āfl㪺ឩ;쀀𝕩imåឲĀAa㫇㫊ròώròਁĀcq㫒ីr;쀀𝓍Āpt៖㫜ré។Ѐacefiosu㫰㫽㬈㬌㬑㬕㬛㬡cĀuy㫶㫻te耻ý䃽;䑏Āiy㬂㬆rc;䅷;䑋n耻¥䂥r;쀀𝔶cy;䑗pf;쀀𝕪cr;쀀𝓎Ācm㬦㬩y;䑎l耻ÿ䃿Ԁacdefhiosw㭂㭈㭔㭘㭤㭩㭭㭴㭺㮀cute;䅺Āay㭍㭒ron;䅾;䐷ot;䅼Āet㭝㭡træᕟa;䎶r;쀀𝔷cy;䐶grarr;懝pf;쀀𝕫cr;쀀𝓏Ājn㮅㮇;怍j;怌'.split("").map(e=>e.charCodeAt(0))),mb=new Uint16Array("Ȁaglq	\x1Bɭ\0\0p;䀦os;䀧t;䀾t;䀼uot;䀢".split("").map(e=>e.charCodeAt(0)));var Gi;const bb=new Map([[0,65533],[128,8364],[130,8218],[131,402],[132,8222],[133,8230],[134,8224],[135,8225],[136,710],[137,8240],[138,352],[139,8249],[140,338],[142,381],[145,8216],[146,8217],[147,8220],[148,8221],[149,8226],[150,8211],[151,8212],[152,732],[153,8482],[154,353],[155,8250],[156,339],[158,382],[159,376]]),gb=(Gi=String.fromCodePoint)!==null&&Gi!==void 0?Gi:function(e){let t="";return e>65535&&(e-=65536,t+=String.fromCharCode(e>>>10&1023|55296),e=56320|e&1023),t+=String.fromCharCode(e),t};function _b(e){var t;return 
e>=55296&&e<=57343||e>1114111?65533:(t=bb.get(e))!==null&&t!==void 0?t:e}var Te;(function(e){e[e.NUM=35]="NUM",e[e.SEMI=59]="SEMI",e[e.EQUALS=61]="EQUALS",e[e.ZERO=48]="ZERO",e[e.NINE=57]="NINE",e[e.LOWER_A=97]="LOWER_A",e[e.LOWER_F=102]="LOWER_F",e[e.LOWER_X=120]="LOWER_X",e[e.LOWER_Z=122]="LOWER_Z",e[e.UPPER_A=65]="UPPER_A",e[e.UPPER_F=70]="UPPER_F",e[e.UPPER_Z=90]="UPPER_Z"})(Te||(Te={}));const yb=32;var eu;(function(e){e[e.VALUE_LENGTH=49152]="VALUE_LENGTH",e[e.BRANCH_LENGTH=16256]="BRANCH_LENGTH",e[e.JUMP_TABLE=127]="JUMP_TABLE"})(eu||(eu={}));function Is(e){return e>=Te.ZERO&&e<=Te.NINE}function xb(e){return e>=Te.UPPER_A&&e<=Te.UPPER_F||e>=Te.LOWER_A&&e<=Te.LOWER_F}function Eb(e){return e>=Te.UPPER_A&&e<=Te.UPPER_Z||e>=Te.LOWER_A&&e<=Te.LOWER_Z||Is(e)}function Cb(e){return e===Te.EQUALS||Eb(e)}var De;(function(e){e[e.EntityStart=0]="EntityStart",e[e.NumericStart=1]="NumericStart",e[e.NumericDecimal=2]="NumericDecimal",e[e.NumericHex=3]="NumericHex",e[e.NamedEntity=4]="NamedEntity"})(De||(De={}));var Qt;(function(e){e[e.Legacy=0]="Legacy",e[e.Strict=1]="Strict",e[e.Attribute=2]="Attribute"})(Qt||(Qt={}));class kb{constructor(t,u,n){this.decodeTree=t,this.emitCodePoint=u,this.errors=n,this.state=De.EntityStart,this.consumed=1,this.result=0,this.treeIndex=0,this.excess=1,this.decodeMode=Qt.Strict}startEntity(t){this.decodeMode=t,this.state=De.EntityStart,this.result=0,this.treeIndex=0,this.excess=1,this.consumed=1}write(t,u){switch(this.state){case De.EntityStart:return t.charCodeAt(u)===Te.NUM?(this.state=De.NumericStart,this.consumed+=1,this.stateNumericStart(t,u+1)):(this.state=De.NamedEntity,this.stateNamedEntity(t,u));case De.NumericStart:return this.stateNumericStart(t,u);case De.NumericDecimal:return this.stateNumericDecimal(t,u);case De.NumericHex:return this.stateNumericHex(t,u);case De.NamedEntity:return this.stateNamedEntity(t,u)}}stateNumericStart(t,u){return 
u>=t.length?-1:(t.charCodeAt(u)|yb)===Te.LOWER_X?(this.state=De.NumericHex,this.consumed+=1,this.stateNumericHex(t,u+1)):(this.state=De.NumericDecimal,this.stateNumericDecimal(t,u))}addToNumericResult(t,u,n,r){if(u!==n){const i=n-u;this.result=this.result*Math.pow(r,i)+parseInt(t.substr(u,i),r),this.consumed+=i}}stateNumericHex(t,u){const n=u;for(;u<t.length;){const r=t.charCodeAt(u);if(Is(r)||xb(r))u+=1;else return this.addToNumericResult(t,n,u,16),this.emitNumericEntity(r,3)}return this.addToNumericResult(t,n,u,16),-1}stateNumericDecimal(t,u){const n=u;for(;u<t.length;){const r=t.charCodeAt(u);if(Is(r))u+=1;else return this.addToNumericResult(t,n,u,10),this.emitNumericEntity(r,2)}return this.addToNumericResult(t,n,u,10),-1}emitNumericEntity(t,u){var n;if(this.consumed<=u)return(n=this.errors)===null||n===void 0||n.absenceOfDigitsInNumericCharacterReference(this.consumed),0;if(t===Te.SEMI)this.consumed+=1;else if(this.decodeMode===Qt.Strict)return 0;return this.emitCodePoint(_b(this.result),this.consumed),this.errors&&(t!==Te.SEMI&&this.errors.missingSemicolonAfterCharacterReference(),this.errors.validateNumericCharacterReference(this.result)),this.consumed}stateNamedEntity(t,u){const{decodeTree:n}=this;let r=n[this.treeIndex],i=(r&eu.VALUE_LENGTH)>>14;for(;u<t.length;u++,this.excess++){const s=t.charCodeAt(u);if(this.treeIndex=Ab(n,r,this.treeIndex+Math.max(1,i),s),this.treeIndex<0)return this.result===0||this.decodeMode===Qt.Attribute&&(i===0||Cb(s))?0:this.emitNotTerminatedNamedEntity();if(r=n[this.treeIndex],i=(r&eu.VALUE_LENGTH)>>14,i!==0){if(s===Te.SEMI)return this.emitNamedEntityData(this.treeIndex,i,this.consumed+this.excess);this.decodeMode!==Qt.Strict&&(this.result=this.treeIndex,this.consumed+=this.excess,this.excess=0)}}return-1}emitNotTerminatedNamedEntity(){var t;const{result:u,decodeTree:n}=this,r=(n[u]&eu.VALUE_LENGTH)>>14;return this.emitNamedEntityData(u,r,this.consumed),(t=this.errors)===null||t===void 
0||t.missingSemicolonAfterCharacterReference(),this.consumed}emitNamedEntityData(t,u,n){const{decodeTree:r}=this;return this.emitCodePoint(u===1?r[t]&~eu.VALUE_LENGTH:r[t+1],n),u===3&&this.emitCodePoint(r[t+2],n),n}end(){var t;switch(this.state){case De.NamedEntity:return this.result!==0&&(this.decodeMode!==Qt.Attribute||this.result===this.treeIndex)?this.emitNotTerminatedNamedEntity():0;case De.NumericDecimal:return this.emitNumericEntity(0,2);case De.NumericHex:return this.emitNumericEntity(0,3);case De.NumericStart:return(t=this.errors)===null||t===void 0||t.absenceOfDigitsInNumericCharacterReference(this.consumed),0;case De.EntityStart:return 0}}}function gf(e){let t="";const u=new kb(e,n=>t+=gb(n));return function(r,i){let s=0,o=0;for(;(o=r.indexOf("&",o))>=0;){t+=r.slice(s,o),u.startEntity(i);const l=u.write(r,o+1);if(l<0){s=o+u.end();break}s=o+l,o=l===0?s+1:s}const c=t+r.slice(s);return t="",c}}function Ab(e,t,u,n){const r=(t&eu.BRANCH_LENGTH)>>7,i=t&eu.JUMP_TABLE;if(r===0)return i!==0&&n===i?u:-1;if(i){const c=n-i;return c<0||c>=r?-1:e[u+c]-1}let s=u,o=s+r-1;for(;s<=o;){const c=s+o>>>1,l=e[c];if(l<n)s=c+1;else if(l>n)o=c-1;else return e[c+r]}return-1}const Sb=gf(pb);gf(mb);function _f(e,t=Qt.Legacy){return Sb(e,t)}function vb(e){return Object.prototype.toString.call(e)}function Lo(e){return vb(e)==="[object String]"}const Db=Object.prototype.hasOwnProperty;function Tb(e,t){return Db.call(e,t)}function vi(e){return Array.prototype.slice.call(arguments,1).forEach(function(u){if(u){if(typeof u!="object")throw new TypeError(u+"must be object");Object.keys(u).forEach(function(n){e[n]=u[n]})}}),e}function yf(e,t,u){return[].concat(e.slice(0,t),u,e.slice(t+1))}function Bo(e){return!(e>=55296&&e<=57343||e>=64976&&e<=65007||(e&65535)===65535||(e&65535)===65534||e>=0&&e<=8||e===11||e>=14&&e<=31||e>=127&&e<=159||e>1114111)}function Xr(e){if(e>65535){e-=65536;const t=55296+(e>>10),u=56320+(e&1023);return String.fromCharCode(t,u)}return String.fromCharCode(e)}const 
xf=/\\([!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])/g,wb=/&([a-z#][a-z0-9]{1,31});/gi,Fb=new RegExp(xf.source+"|"+wb.source,"gi"),Ib=/^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i;function Ob(e,t){if(t.charCodeAt(0)===35&&Ib.test(t)){const n=t[1].toLowerCase()==="x"?parseInt(t.slice(2),16):parseInt(t.slice(1),10);return Bo(n)?Xr(n):e}const u=_f(e);return u!==e?u:e}function Nb(e){return e.indexOf("\\")<0?e:e.replace(xf,"$1")}function en(e){return e.indexOf("\\")<0&&e.indexOf("&")<0?e:e.replace(Fb,function(t,u,n){return u||Ob(t,n)})}const Rb=/[&<>"]/,Mb=/[&<>"]/g,Pb={"&":"&amp;","<":"&lt;",">":"&gt;",'"':"&quot;"};function Lb(e){return Pb[e]}function lu(e){return Rb.test(e)?e.replace(Mb,Lb):e}const Bb=/[.?*+^$[\]\\(){}|-]/g;function Vb(e){return e.replace(Bb,"\\$&")}function ae(e){switch(e){case 9:case 32:return!0}return!1}function Un(e){if(e>=8192&&e<=8202)return!0;switch(e){case 9:case 10:case 11:case 12:case 13:case 32:case 160:case 5760:case 8239:case 8287:case 12288:return!0}return!1}function qn(e){return Po.test(e)||mf.test(e)}function jn(e){switch(e){case 33:case 34:case 35:case 36:case 37:case 38:case 39:case 40:case 41:case 42:case 43:case 44:case 45:case 46:case 47:case 58:case 59:case 60:case 61:case 62:case 63:case 64:case 91:case 92:case 93:case 94:case 95:case 96:case 123:case 124:case 125:case 126:return!0;default:return!1}}function Di(e){return e=e.trim().replace(/\s+/g," "),"ẞ".toLowerCase()==="Ṿ"&&(e=e.replace(/ẞ/g,"ß")),e.toLowerCase().toUpperCase()}const $b={mdurl:fb,ucmicro:hb},Hb=Object.freeze(Object.defineProperty({__proto__:null,arrayReplaceAt:yf,assign:vi,escapeHtml:lu,escapeRE:Vb,fromCodePoint:Xr,has:Tb,isMdAsciiPunct:jn,isPunctChar:qn,isSpace:ae,isString:Lo,isValidEntityCode:Bo,isWhiteSpace:Un,lib:$b,normalizeReference:Di,unescapeAll:en,unescapeMd:Nb},Symbol.toStringTag,{value:"Module"}));function Ub(e,t,u){let n,r,i,s;const 
o=e.posMax,c=e.pos;for(e.pos=t+1,n=1;e.pos<o;){if(i=e.src.charCodeAt(e.pos),i===93&&(n--,n===0)){r=!0;break}if(s=e.pos,e.md.inline.skipToken(e),i===91){if(s===e.pos-1)n++;else if(u)return e.pos=c,-1}}let l=-1;return r&&(l=e.pos),e.pos=c,l}function qb(e,t,u){let n,r=t;const i={ok:!1,pos:0,str:""};if(e.charCodeAt(r)===60){for(r++;r<u;){if(n=e.charCodeAt(r),n===10||n===60)return i;if(n===62)return i.pos=r+1,i.str=en(e.slice(t+1,r)),i.ok=!0,i;if(n===92&&r+1<u){r+=2;continue}r++}return i}let s=0;for(;r<u&&(n=e.charCodeAt(r),!(n===32||n<32||n===127));){if(n===92&&r+1<u){if(e.charCodeAt(r+1)===32)break;r+=2;continue}if(n===40&&(s++,s>32))return i;if(n===41){if(s===0)break;s--}r++}return t===r||s!==0||(i.str=en(e.slice(t,r)),i.pos=r,i.ok=!0),i}function jb(e,t,u,n){let r,i=t;const s={ok:!1,can_continue:!1,pos:0,str:"",marker:0};if(n)s.str=n.str,s.marker=n.marker;else{if(i>=u)return s;let o=e.charCodeAt(i);if(o!==34&&o!==39&&o!==40)return s;t++,i++,o===40&&(o=41),s.marker=o}for(;i<u;){if(r=e.charCodeAt(i),r===s.marker)return s.pos=i+1,s.str+=en(e.slice(t,i)),s.ok=!0,s;if(r===40&&s.marker===41)return s;r===92&&i+1<u&&i++,i++}return s.can_continue=!0,s.str+=en(e.slice(t,i)),s}const zb=Object.freeze(Object.defineProperty({__proto__:null,parseLinkDestination:qb,parseLinkLabel:Ub,parseLinkTitle:jb},Symbol.toStringTag,{value:"Module"})),Dt={};Dt.code_inline=function(e,t,u,n,r){const i=e[t];return"<code"+r.renderAttrs(i)+">"+lu(i.content)+"</code>"};Dt.code_block=function(e,t,u,n,r){const i=e[t];return"<pre"+r.renderAttrs(i)+"><code>"+lu(e[t].content)+`</code></pre>
-`};Dt.fence=function(e,t,u,n,r){const i=e[t],s=i.info?en(i.info).trim():"";let o="",c="";if(s){const a=s.split(/(\s+)/g);o=a[0],c=a.slice(2).join("")}let l;if(u.highlight?l=u.highlight(i.content,o,c)||lu(i.content):l=lu(i.content),l.indexOf("<pre")===0)return l+`
-`;if(s){const a=i.attrIndex("class"),f=i.attrs?i.attrs.slice():[];a<0?f.push(["class",u.langPrefix+o]):(f[a]=f[a].slice(),f[a][1]+=" "+u.langPrefix+o);const p={attrs:f};return`<pre><code${r.renderAttrs(p)}>${l}</code></pre>
-`}return`<pre><code${r.renderAttrs(i)}>${l}</code></pre>
-`};Dt.image=function(e,t,u,n,r){const i=e[t];return i.attrs[i.attrIndex("alt")][1]=r.renderInlineAsText(i.children,u,n),r.renderToken(e,t,u)};Dt.hardbreak=function(e,t,u){return u.xhtmlOut?`<br />
-`:`<br>
-`};Dt.softbreak=function(e,t,u){return u.breaks?u.xhtmlOut?`<br />
-`:`<br>
-`:`
-`};Dt.text=function(e,t){return lu(e[t].content)};Dt.html_block=function(e,t){return e[t].content};Dt.html_inline=function(e,t){return e[t].content};function rn(){this.rules=vi({},Dt)}rn.prototype.renderAttrs=function(t){let u,n,r;if(!t.attrs)return"";for(r="",u=0,n=t.attrs.length;u<n;u++)r+=" "+lu(t.attrs[u][0])+'="'+lu(t.attrs[u][1])+'"';return r};rn.prototype.renderToken=function(t,u,n){const r=t[u];let i="";if(r.hidden)return"";r.block&&r.nesting!==-1&&u&&t[u-1].hidden&&(i+=`
-`),i+=(r.nesting===-1?"</":"<")+r.tag,i+=this.renderAttrs(r),r.nesting===0&&n.xhtmlOut&&(i+=" /");let s=!1;if(r.block&&(s=!0,r.nesting===1&&u+1<t.length)){const o=t[u+1];(o.type==="inline"||o.hidden||o.nesting===-1&&o.tag===r.tag)&&(s=!1)}return i+=s?`>
-`:">",i};rn.prototype.renderInline=function(e,t,u){let n="";const r=this.rules;for(let i=0,s=e.length;i<s;i++){const o=e[i].type;typeof r[o]<"u"?n+=r[o](e,i,t,u,this):n+=this.renderToken(e,i,t)}return n};rn.prototype.renderInlineAsText=function(e,t,u){let n="";for(let r=0,i=e.length;r<i;r++)switch(e[r].type){case"text":n+=e[r].content;break;case"image":n+=this.renderInlineAsText(e[r].children,t,u);break;case"html_inline":case"html_block":n+=e[r].content;break;case"softbreak":case"hardbreak":n+=`
-`;break}return n};rn.prototype.render=function(e,t,u){let n="";const r=this.rules;for(let i=0,s=e.length;i<s;i++){const o=e[i].type;o==="inline"?n+=this.renderInline(e[i].children,t,u):typeof r[o]<"u"?n+=r[o](e,i,t,u,this):n+=this.renderToken(e,i,t,u)}return n};function Je(){this.__rules__=[],this.__cache__=null}Je.prototype.__find__=function(e){for(let t=0;t<this.__rules__.length;t++)if(this.__rules__[t].name===e)return t;return-1};Je.prototype.__compile__=function(){const e=this,t=[""];e.__rules__.forEach(function(u){u.enabled&&u.alt.forEach(function(n){t.indexOf(n)<0&&t.push(n)})}),e.__cache__={},t.forEach(function(u){e.__cache__[u]=[],e.__rules__.forEach(function(n){n.enabled&&(u&&n.alt.indexOf(u)<0||e.__cache__[u].push(n.fn))})})};Je.prototype.at=function(e,t,u){const n=this.__find__(e),r=u||{};if(n===-1)throw new Error("Parser rule not found: "+e);this.__rules__[n].fn=t,this.__rules__[n].alt=r.alt||[],this.__cache__=null};Je.prototype.before=function(e,t,u,n){const r=this.__find__(e),i=n||{};if(r===-1)throw new Error("Parser rule not found: "+e);this.__rules__.splice(r,0,{name:t,enabled:!0,fn:u,alt:i.alt||[]}),this.__cache__=null};Je.prototype.after=function(e,t,u,n){const r=this.__find__(e),i=n||{};if(r===-1)throw new Error("Parser rule not found: "+e);this.__rules__.splice(r+1,0,{name:t,enabled:!0,fn:u,alt:i.alt||[]}),this.__cache__=null};Je.prototype.push=function(e,t,u){const n=u||{};this.__rules__.push({name:e,enabled:!0,fn:t,alt:n.alt||[]}),this.__cache__=null};Je.prototype.enable=function(e,t){Array.isArray(e)||(e=[e]);const u=[];return e.forEach(function(n){const r=this.__find__(n);if(r<0){if(t)return;throw new Error("Rules manager: invalid rule name "+n)}this.__rules__[r].enabled=!0,u.push(n)},this),this.__cache__=null,u};Je.prototype.enableOnly=function(e,t){Array.isArray(e)||(e=[e]),this.__rules__.forEach(function(u){u.enabled=!1}),this.enable(e,t)};Je.prototype.disable=function(e,t){Array.isArray(e)||(e=[e]);const u=[];return 
e.forEach(function(n){const r=this.__find__(n);if(r<0){if(t)return;throw new Error("Rules manager: invalid rule name "+n)}this.__rules__[r].enabled=!1,u.push(n)},this),this.__cache__=null,u};Je.prototype.getRules=function(e){return this.__cache__===null&&this.__compile__(),this.__cache__[e]||[]};function _t(e,t,u){this.type=e,this.tag=t,this.attrs=null,this.map=null,this.nesting=u,this.level=0,this.children=null,this.content="",this.markup="",this.info="",this.meta=null,this.block=!1,this.hidden=!1}_t.prototype.attrIndex=function(t){if(!this.attrs)return-1;const u=this.attrs;for(let n=0,r=u.length;n<r;n++)if(u[n][0]===t)return n;return-1};_t.prototype.attrPush=function(t){this.attrs?this.attrs.push(t):this.attrs=[t]};_t.prototype.attrSet=function(t,u){const n=this.attrIndex(t),r=[t,u];n<0?this.attrPush(r):this.attrs[n]=r};_t.prototype.attrGet=function(t){const u=this.attrIndex(t);let n=null;return u>=0&&(n=this.attrs[u][1]),n};_t.prototype.attrJoin=function(t,u){const n=this.attrIndex(t);n<0?this.attrPush([t,u]):this.attrs[n][1]=this.attrs[n][1]+" "+u};function Ef(e,t,u){this.src=e,this.env=u,this.tokens=[],this.inlineMode=!1,this.md=t}Ef.prototype.Token=_t;const Kb=/\r\n?|\n/g,Wb=/\0/g;function Gb(e){let t;t=e.src.replace(Kb,`
-`),t=t.replace(Wb,"�"),e.src=t}function Zb(e){let t;e.inlineMode?(t=new e.Token("inline","",0),t.content=e.src,t.map=[0,1],t.children=[],e.tokens.push(t)):e.md.block.parse(e.src,e.md,e.env,e.tokens)}function Jb(e){const t=e.tokens;for(let u=0,n=t.length;u<n;u++){const r=t[u];r.type==="inline"&&e.md.inline.parse(r.content,e.md,e.env,r.children)}}function Yb(e){return/^<a[>\s]/i.test(e)}function Xb(e){return/^<\/a\s*>/i.test(e)}function Qb(e){const t=e.tokens;if(e.md.options.linkify)for(let u=0,n=t.length;u<n;u++){if(t[u].type!=="inline"||!e.md.linkify.pretest(t[u].content))continue;let r=t[u].children,i=0;for(let s=r.length-1;s>=0;s--){const o=r[s];if(o.type==="link_close"){for(s--;r[s].level!==o.level&&r[s].type!=="link_open";)s--;continue}if(o.type==="html_inline"&&(Yb(o.content)&&i>0&&i--,Xb(o.content)&&i++),!(i>0)&&o.type==="text"&&e.md.linkify.test(o.content)){const c=o.content;let l=e.md.linkify.match(c);const a=[];let f=o.level,p=0;l.length>0&&l[0].index===0&&s>0&&r[s-1].type==="text_special"&&(l=l.slice(1));for(let d=0;d<l.length;d++){const h=l[d].url,_=e.md.normalizeLink(h);if(!e.md.validateLink(_))continue;let v=l[d].text;l[d].schema?l[d].schema==="mailto:"&&!/^mailto:/i.test(v)?v=e.md.normalizeLinkText("mailto:"+v).replace(/^mailto:/,""):v=e.md.normalizeLinkText(v):v=e.md.normalizeLinkText("http://"+v).replace(/^http:\/\//,"");const A=l[d].index;if(A>p){const x=new e.Token("text","",0);x.content=c.slice(p,A),x.level=f,a.push(x)}const E=new e.Token("link_open","a",1);E.attrs=[["href",_]],E.level=f++,E.markup="linkify",E.info="auto",a.push(E);const m=new e.Token("text","",0);m.content=v,m.level=f,a.push(m);const g=new e.Token("link_close","a",-1);g.level=--f,g.markup="linkify",g.info="auto",a.push(g),p=l[d].lastIndex}if(p<c.length){const d=new e.Token("text","",0);d.content=c.slice(p),d.level=f,a.push(d)}t[u].children=r=yf(r,s,a)}}}}const Cf=/\+-|\.\.|\?\?\?\?|!!!!|,,|--/,eg=/\((c|tm|r)\)/i,tg=/\((c|tm|r)\)/ig,ug={c:"©",r:"®",tm:"™"};function 
ng(e,t){return ug[t.toLowerCase()]}function rg(e){let t=0;for(let u=e.length-1;u>=0;u--){const n=e[u];n.type==="text"&&!t&&(n.content=n.content.replace(tg,ng)),n.type==="link_open"&&n.info==="auto"&&t--,n.type==="link_close"&&n.info==="auto"&&t++}}function ig(e){let t=0;for(let u=e.length-1;u>=0;u--){const n=e[u];n.type==="text"&&!t&&Cf.test(n.content)&&(n.content=n.content.replace(/\+-/g,"±").replace(/\.{2,}/g,"…").replace(/([?!])…/g,"$1..").replace(/([?!]){4,}/g,"$1$1$1").replace(/,{2,}/g,",").replace(/(^|[^-])---(?=[^-]|$)/mg,"$1—").replace(/(^|\s)--(?=\s|$)/mg,"$1–").replace(/(^|[^-\s])--(?=[^-\s]|$)/mg,"$1–")),n.type==="link_open"&&n.info==="auto"&&t--,n.type==="link_close"&&n.info==="auto"&&t++}}function sg(e){let t;if(e.md.options.typographer)for(t=e.tokens.length-1;t>=0;t--)e.tokens[t].type==="inline"&&(eg.test(e.tokens[t].content)&&rg(e.tokens[t].children),Cf.test(e.tokens[t].content)&&ig(e.tokens[t].children))}const og=/['"]/,cl=/['"]/g,ll="’";function mr(e,t,u){return e.slice(0,t)+u+e.slice(t+1)}function cg(e,t){let u;const n=[];for(let r=0;r<e.length;r++){const i=e[r],s=e[r].level;for(u=n.length-1;u>=0&&!(n[u].level<=s);u--);if(n.length=u+1,i.type!=="text")continue;let o=i.content,c=0,l=o.length;e:for(;c<l;){cl.lastIndex=c;const a=cl.exec(o);if(!a)break;let f=!0,p=!0;c=a.index+1;const d=a[0]==="'";let h=32;if(a.index-1>=0)h=o.charCodeAt(a.index-1);else for(u=r-1;u>=0&&!(e[u].type==="softbreak"||e[u].type==="hardbreak");u--)if(e[u].content){h=e[u].content.charCodeAt(e[u].content.length-1);break}let _=32;if(c<l)_=o.charCodeAt(c);else for(u=r+1;u<e.length&&!(e[u].type==="softbreak"||e[u].type==="hardbreak");u++)if(e[u].content){_=e[u].content.charCodeAt(0);break}const v=jn(h)||qn(String.fromCharCode(h)),A=jn(_)||qn(String.fromCharCode(_)),E=Un(h),m=Un(_);if(m?f=!1:A&&(E||v||(f=!1)),E?p=!1:v&&(m||A||(p=!1)),_===34&&a[0]==='"'&&h>=48&&h<=57&&(p=f=!1),f&&p&&(f=v,p=A),!f&&!p){d&&(i.content=mr(i.content,a.index,ll));continue}if(p)for(u=n.length-1;u>=0;u--){let 
g=n[u];if(n[u].level<s)break;if(g.single===d&&n[u].level===s){g=n[u];let x,k;d?(x=t.md.options.quotes[2],k=t.md.options.quotes[3]):(x=t.md.options.quotes[0],k=t.md.options.quotes[1]),i.content=mr(i.content,a.index,k),e[g.token].content=mr(e[g.token].content,g.pos,x),c+=k.length-1,g.token===r&&(c+=x.length-1),o=i.content,l=o.length,n.length=u;continue e}}f?n.push({token:r,pos:a.index,single:d,level:s}):p&&d&&(i.content=mr(i.content,a.index,ll))}}}function lg(e){if(e.md.options.typographer)for(let t=e.tokens.length-1;t>=0;t--)e.tokens[t].type!=="inline"||!og.test(e.tokens[t].content)||cg(e.tokens[t].children,e)}function ag(e){let t,u;const n=e.tokens,r=n.length;for(let i=0;i<r;i++){if(n[i].type!=="inline")continue;const s=n[i].children,o=s.length;for(t=0;t<o;t++)s[t].type==="text_special"&&(s[t].type="text");for(t=u=0;t<o;t++)s[t].type==="text"&&t+1<o&&s[t+1].type==="text"?s[t+1].content=s[t].content+s[t+1].content:(t!==u&&(s[u]=s[t]),u++);t!==u&&(s.length=u)}}const Zi=[["normalize",Gb],["block",Zb],["inline",Jb],["linkify",Qb],["replacements",sg],["smartquotes",lg],["text_join",ag]];function Vo(){this.ruler=new Je;for(let e=0;e<Zi.length;e++)this.ruler.push(Zi[e][0],Zi[e][1])}Vo.prototype.process=function(e){const t=this.ruler.getRules("");for(let u=0,n=t.length;u<n;u++)t[u](e)};Vo.prototype.State=Ef;function Tt(e,t,u,n){this.src=e,this.md=t,this.env=u,this.tokens=n,this.bMarks=[],this.eMarks=[],this.tShift=[],this.sCount=[],this.bsCount=[],this.blkIndent=0,this.line=0,this.lineMax=0,this.tight=!1,this.ddIndent=-1,this.listIndent=-1,this.parentType="root",this.level=0;const r=this.src;for(let i=0,s=0,o=0,c=0,l=r.length,a=!1;s<l;s++){const f=r.charCodeAt(s);if(!a)if(ae(f)){o++,f===9?c+=4-c%4:c++;continue}else 
a=!0;(f===10||s===l-1)&&(f!==10&&s++,this.bMarks.push(i),this.eMarks.push(s),this.tShift.push(o),this.sCount.push(c),this.bsCount.push(0),a=!1,o=0,c=0,i=s+1)}this.bMarks.push(r.length),this.eMarks.push(r.length),this.tShift.push(0),this.sCount.push(0),this.bsCount.push(0),this.lineMax=this.bMarks.length-1}Tt.prototype.push=function(e,t,u){const n=new _t(e,t,u);return n.block=!0,u<0&&this.level--,n.level=this.level,u>0&&this.level++,this.tokens.push(n),n};Tt.prototype.isEmpty=function(t){return this.bMarks[t]+this.tShift[t]>=this.eMarks[t]};Tt.prototype.skipEmptyLines=function(t){for(let u=this.lineMax;t<u&&!(this.bMarks[t]+this.tShift[t]<this.eMarks[t]);t++);return t};Tt.prototype.skipSpaces=function(t){for(let u=this.src.length;t<u;t++){const n=this.src.charCodeAt(t);if(!ae(n))break}return t};Tt.prototype.skipSpacesBack=function(t,u){if(t<=u)return t;for(;t>u;)if(!ae(this.src.charCodeAt(--t)))return t+1;return t};Tt.prototype.skipChars=function(t,u){for(let n=this.src.length;t<n&&this.src.charCodeAt(t)===u;t++);return t};Tt.prototype.skipCharsBack=function(t,u,n){if(t<=n)return t;for(;t>n;)if(u!==this.src.charCodeAt(--t))return t+1;return t};Tt.prototype.getLines=function(t,u,n,r){if(t>=u)return"";const i=new Array(u-t);for(let s=0,o=t;o<u;o++,s++){let c=0;const l=this.bMarks[o];let a=l,f;for(o+1<u||r?f=this.eMarks[o]+1:f=this.eMarks[o];a<f&&c<n;){const p=this.src.charCodeAt(a);if(ae(p))p===9?c+=4-(c+this.bsCount[o])%4:c++;else if(a-l<this.tShift[o])c++;else break;a++}c>n?i[s]=new Array(c-n+1).join(" ")+this.src.slice(a,f):i[s]=this.src.slice(a,f)}return i.join("")};Tt.prototype.Token=_t;const fg=65536;function Ji(e,t){const u=e.bMarks[t]+e.tShift[t],n=e.eMarks[t];return e.src.slice(u,n)}function al(e){const t=[],u=e.length;let n=0,r=e.charCodeAt(n),i=!1,s=0,o="";for(;n<u;)r===124&&(i?(o+=e.substring(s,n-1),s=n):(t.push(o+e.substring(s,n)),o="",s=n+1)),i=r===92,n++,r=e.charCodeAt(n);return t.push(o+e.substring(s)),t}function dg(e,t,u,n){if(t+2>u)return!1;let 
r=t+1;if(e.sCount[r]<e.blkIndent||e.sCount[r]-e.blkIndent>=4)return!1;let i=e.bMarks[r]+e.tShift[r];if(i>=e.eMarks[r])return!1;const s=e.src.charCodeAt(i++);if(s!==124&&s!==45&&s!==58||i>=e.eMarks[r])return!1;const o=e.src.charCodeAt(i++);if(o!==124&&o!==45&&o!==58&&!ae(o)||s===45&&ae(o))return!1;for(;i<e.eMarks[r];){const g=e.src.charCodeAt(i);if(g!==124&&g!==45&&g!==58&&!ae(g))return!1;i++}let c=Ji(e,t+1),l=c.split("|");const a=[];for(let g=0;g<l.length;g++){const x=l[g].trim();if(!x){if(g===0||g===l.length-1)continue;return!1}if(!/^:?-+:?$/.test(x))return!1;x.charCodeAt(x.length-1)===58?a.push(x.charCodeAt(0)===58?"center":"right"):x.charCodeAt(0)===58?a.push("left"):a.push("")}if(c=Ji(e,t).trim(),c.indexOf("|")===-1||e.sCount[t]-e.blkIndent>=4)return!1;l=al(c),l.length&&l[0]===""&&l.shift(),l.length&&l[l.length-1]===""&&l.pop();const f=l.length;if(f===0||f!==a.length)return!1;if(n)return!0;const p=e.parentType;e.parentType="table";const d=e.md.block.ruler.getRules("blockquote"),h=e.push("table_open","table",1),_=[t,0];h.map=_;const v=e.push("thead_open","thead",1);v.map=[t,t+1];const A=e.push("tr_open","tr",1);A.map=[t,t+1];for(let g=0;g<l.length;g++){const x=e.push("th_open","th",1);a[g]&&(x.attrs=[["style","text-align:"+a[g]]]);const k=e.push("inline","",0);k.content=l[g].trim(),k.children=[],e.push("th_close","th",-1)}e.push("tr_close","tr",-1),e.push("thead_close","thead",-1);let E,m=0;for(r=t+2;r<u&&!(e.sCount[r]<e.blkIndent);r++){let g=!1;for(let k=0,I=d.length;k<I;k++)if(d[k](e,r,u,!0)){g=!0;break}if(g||(c=Ji(e,r).trim(),!c)||e.sCount[r]-e.blkIndent>=4||(l=al(c),l.length&&l[0]===""&&l.shift(),l.length&&l[l.length-1]===""&&l.pop(),m+=f-l.length,m>fg))break;if(r===t+2){const k=e.push("tbody_open","tbody",1);k.map=E=[t+2,0]}const x=e.push("tr_open","tr",1);x.map=[r,r+1];for(let k=0;k<f;k++){const I=e.push("td_open","td",1);a[k]&&(I.attrs=[["style","text-align:"+a[k]]]);const 
w=e.push("inline","",0);w.content=l[k]?l[k].trim():"",w.children=[],e.push("td_close","td",-1)}e.push("tr_close","tr",-1)}return E&&(e.push("tbody_close","tbody",-1),E[1]=r),e.push("table_close","table",-1),_[1]=r,e.parentType=p,e.line=r,!0}function hg(e,t,u){if(e.sCount[t]-e.blkIndent<4)return!1;let n=t+1,r=n;for(;n<u;){if(e.isEmpty(n)){n++;continue}if(e.sCount[n]-e.blkIndent>=4){n++,r=n;continue}break}e.line=r;const i=e.push("code_block","code",0);return i.content=e.getLines(t,r,4+e.blkIndent,!1)+`
-`,i.map=[t,e.line],!0}function pg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4||r+3>i)return!1;const s=e.src.charCodeAt(r);if(s!==126&&s!==96)return!1;let o=r;r=e.skipChars(r,s);let c=r-o;if(c<3)return!1;const l=e.src.slice(o,r),a=e.src.slice(r,i);if(s===96&&a.indexOf(String.fromCharCode(s))>=0)return!1;if(n)return!0;let f=t,p=!1;for(;f++,!(f>=u||(r=o=e.bMarks[f]+e.tShift[f],i=e.eMarks[f],r<i&&e.sCount[f]<e.blkIndent));)if(e.src.charCodeAt(r)===s&&!(e.sCount[f]-e.blkIndent>=4)&&(r=e.skipChars(r,s),!(r-o<c)&&(r=e.skipSpaces(r),!(r<i)))){p=!0;break}c=e.sCount[t],e.line=f+(p?1:0);const d=e.push("fence","code",0);return d.info=a,d.content=e.getLines(t+1,f,c,!0),d.markup=l,d.map=[t,e.line],!0}function mg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];const s=e.lineMax;if(e.sCount[t]-e.blkIndent>=4||e.src.charCodeAt(r)!==62)return!1;if(n)return!0;const o=[],c=[],l=[],a=[],f=e.md.block.ruler.getRules("blockquote"),p=e.parentType;e.parentType="blockquote";let d=!1,h;for(h=t;h<u;h++){const m=e.sCount[h]<e.blkIndent;if(r=e.bMarks[h]+e.tShift[h],i=e.eMarks[h],r>=i)break;if(e.src.charCodeAt(r++)===62&&!m){let x=e.sCount[h]+1,k,I;e.src.charCodeAt(r)===32?(r++,x++,I=!1,k=!0):e.src.charCodeAt(r)===9?(k=!0,(e.bsCount[h]+x)%4===3?(r++,x++,I=!1):I=!0):k=!1;let w=x;for(o.push(e.bMarks[h]),e.bMarks[h]=r;r<i;){const C=e.src.charCodeAt(r);if(ae(C))C===9?w+=4-(w+e.bsCount[h]+(I?1:0))%4:w++;else break;r++}d=r>=i,c.push(e.bsCount[h]),e.bsCount[h]=e.sCount[h]+1+(k?1:0),l.push(e.sCount[h]),e.sCount[h]=w-x,a.push(e.tShift[h]),e.tShift[h]=r-e.bMarks[h];continue}if(d)break;let g=!1;for(let x=0,k=f.length;x<k;x++)if(f[x](e,h,u,!0)){g=!0;break}if(g){e.lineMax=h,e.blkIndent!==0&&(o.push(e.bMarks[h]),c.push(e.bsCount[h]),a.push(e.tShift[h]),l.push(e.sCount[h]),e.sCount[h]-=e.blkIndent);break}o.push(e.bMarks[h]),c.push(e.bsCount[h]),a.push(e.tShift[h]),l.push(e.sCount[h]),e.sCount[h]=-1}const _=e.blkIndent;e.blkIndent=0;const 
v=e.push("blockquote_open","blockquote",1);v.markup=">";const A=[t,0];v.map=A,e.md.block.tokenize(e,t,h);const E=e.push("blockquote_close","blockquote",-1);E.markup=">",e.lineMax=s,e.parentType=p,A[1]=e.line;for(let m=0;m<a.length;m++)e.bMarks[m+t]=o[m],e.tShift[m+t]=a[m],e.sCount[m+t]=l[m],e.bsCount[m+t]=c[m];return e.blkIndent=_,!0}function bg(e,t,u,n){const r=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4)return!1;let i=e.bMarks[t]+e.tShift[t];const s=e.src.charCodeAt(i++);if(s!==42&&s!==45&&s!==95)return!1;let o=1;for(;i<r;){const l=e.src.charCodeAt(i++);if(l!==s&&!ae(l))return!1;l===s&&o++}if(o<3)return!1;if(n)return!0;e.line=t+1;const c=e.push("hr","hr",0);return c.map=[t,e.line],c.markup=Array(o+1).join(String.fromCharCode(s)),!0}function fl(e,t){const u=e.eMarks[t];let n=e.bMarks[t]+e.tShift[t];const r=e.src.charCodeAt(n++);if(r!==42&&r!==45&&r!==43)return-1;if(n<u){const i=e.src.charCodeAt(n);if(!ae(i))return-1}return n}function dl(e,t){const u=e.bMarks[t]+e.tShift[t],n=e.eMarks[t];let r=u;if(r+1>=n)return-1;let i=e.src.charCodeAt(r++);if(i<48||i>57)return-1;for(;;){if(r>=n)return-1;if(i=e.src.charCodeAt(r++),i>=48&&i<=57){if(r-u>=10)return-1;continue}if(i===41||i===46)break;return-1}return r<n&&(i=e.src.charCodeAt(r),!ae(i))?-1:r}function gg(e,t){const u=e.level+2;for(let n=t+2,r=e.tokens.length-2;n<r;n++)e.tokens[n].level===u&&e.tokens[n].type==="paragraph_open"&&(e.tokens[n+2].hidden=!0,e.tokens[n].hidden=!0,n+=2)}function _g(e,t,u,n){let r,i,s,o,c=t,l=!0;if(e.sCount[c]-e.blkIndent>=4||e.listIndent>=0&&e.sCount[c]-e.listIndent>=4&&e.sCount[c]<e.blkIndent)return!1;let a=!1;n&&e.parentType==="paragraph"&&e.sCount[c]>=e.blkIndent&&(a=!0);let f,p,d;if((d=dl(e,c))>=0){if(f=!0,s=e.bMarks[c]+e.tShift[c],p=Number(e.src.slice(s,d-1)),a&&p!==1)return!1}else if((d=fl(e,c))>=0)f=!1;else return!1;if(a&&e.skipSpaces(d)>=e.eMarks[c])return!1;if(n)return!0;const 
h=e.src.charCodeAt(d-1),_=e.tokens.length;f?(o=e.push("ordered_list_open","ol",1),p!==1&&(o.attrs=[["start",p]])):o=e.push("bullet_list_open","ul",1);const v=[c,0];o.map=v,o.markup=String.fromCharCode(h);let A=!1;const E=e.md.block.ruler.getRules("list"),m=e.parentType;for(e.parentType="list";c<u;){i=d,r=e.eMarks[c];const g=e.sCount[c]+d-(e.bMarks[c]+e.tShift[c]);let x=g;for(;i<r;){const z=e.src.charCodeAt(i);if(z===9)x+=4-(x+e.bsCount[c])%4;else if(z===32)x++;else break;i++}const k=i;let I;k>=r?I=1:I=x-g,I>4&&(I=1);const w=g+I;o=e.push("list_item_open","li",1),o.markup=String.fromCharCode(h);const C=[c,0];o.map=C,f&&(o.info=e.src.slice(s,d-1));const S=e.tight,R=e.tShift[c],T=e.sCount[c],P=e.listIndent;if(e.listIndent=e.blkIndent,e.blkIndent=w,e.tight=!0,e.tShift[c]=k-e.bMarks[c],e.sCount[c]=x,k>=r&&e.isEmpty(c+1)?e.line=Math.min(e.line+2,u):e.md.block.tokenize(e,c,u,!0),(!e.tight||A)&&(l=!1),A=e.line-c>1&&e.isEmpty(e.line-1),e.blkIndent=e.listIndent,e.listIndent=P,e.tShift[c]=R,e.sCount[c]=T,e.tight=S,o=e.push("list_item_close","li",-1),o.markup=String.fromCharCode(h),c=e.line,C[1]=c,c>=u||e.sCount[c]<e.blkIndent||e.sCount[c]-e.blkIndent>=4)break;let q=!1;for(let z=0,H=E.length;z<H;z++)if(E[z](e,c,u,!0)){q=!0;break}if(q)break;if(f){if(d=dl(e,c),d<0)break;s=e.bMarks[c]+e.tShift[c]}else if(d=fl(e,c),d<0)break;if(h!==e.src.charCodeAt(d-1))break}return f?o=e.push("ordered_list_close","ol",-1):o=e.push("bullet_list_close","ul",-1),o.markup=String.fromCharCode(h),v[1]=c,e.line=c,e.parentType=m,l&&gg(e,_),!0}function yg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t],s=t+1;if(e.sCount[t]-e.blkIndent>=4||e.src.charCodeAt(r)!==91)return!1;function o(E){const m=e.lineMax;if(E>=m||e.isEmpty(E))return null;let g=!1;if(e.sCount[E]-e.blkIndent>3&&(g=!0),e.sCount[E]<0&&(g=!0),!g){const I=e.md.block.ruler.getRules("reference"),w=e.parentType;e.parentType="reference";let C=!1;for(let S=0,R=I.length;S<R;S++)if(I[S](e,E,m,!0)){C=!0;break}if(e.parentType=w,C)return null}const 
x=e.bMarks[E]+e.tShift[E],k=e.eMarks[E];return e.src.slice(x,k+1)}let c=e.src.slice(r,i+1);i=c.length;let l=-1;for(r=1;r<i;r++){const E=c.charCodeAt(r);if(E===91)return!1;if(E===93){l=r;break}else if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(E===92&&(r++,r<i&&c.charCodeAt(r)===10)){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}}if(l<0||c.charCodeAt(l+1)!==58)return!1;for(r=l+2;r<i;r++){const E=c.charCodeAt(r);if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(!ae(E))break}const a=e.md.helpers.parseLinkDestination(c,r,i);if(!a.ok)return!1;const f=e.md.normalizeLink(a.str);if(!e.md.validateLink(f))return!1;r=a.pos;const p=r,d=s,h=r;for(;r<i;r++){const E=c.charCodeAt(r);if(E===10){const m=o(s);m!==null&&(c+=m,i=c.length,s++)}else if(!ae(E))break}let _=e.md.helpers.parseLinkTitle(c,r,i);for(;_.can_continue;){const E=o(s);if(E===null)break;c+=E,r=i,i=c.length,s++,_=e.md.helpers.parseLinkTitle(c,r,i,_)}let v;for(r<i&&h!==r&&_.ok?(v=_.str,r=_.pos):(v="",r=p,s=d);r<i;){const E=c.charCodeAt(r);if(!ae(E))break;r++}if(r<i&&c.charCodeAt(r)!==10&&v)for(v="",r=p,s=d;r<i;){const E=c.charCodeAt(r);if(!ae(E))break;r++}if(r<i&&c.charCodeAt(r)!==10)return!1;const A=Di(c.slice(1,l));return A?(n||(typeof e.env.references>"u"&&(e.env.references={}),typeof e.env.references[A]>"u"&&(e.env.references[A]={title:v,href:f}),e.line=s),!0):!1}const 
xg=["address","article","aside","base","basefont","blockquote","body","caption","center","col","colgroup","dd","details","dialog","dir","div","dl","dt","fieldset","figcaption","figure","footer","form","frame","frameset","h1","h2","h3","h4","h5","h6","head","header","hr","html","iframe","legend","li","link","main","menu","menuitem","nav","noframes","ol","optgroup","option","p","param","search","section","summary","table","tbody","td","tfoot","th","thead","title","tr","track","ul"],Eg="[a-zA-Z_:][a-zA-Z0-9:._-]*",Cg="[^\"'=<>`\\x00-\\x20]+",kg="'[^']*'",Ag='"[^"]*"',Sg="(?:"+Cg+"|"+kg+"|"+Ag+")",vg="(?:\\s+"+Eg+"(?:\\s*=\\s*"+Sg+")?)",kf="<[A-Za-z][A-Za-z0-9\\-]*"+vg+"*\\s*\\/?>",Af="<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>",Dg="\x3C!---?>|\x3C!--(?:[^-]|-[^-]|--[^>])*-->",Tg="<[?][\\s\\S]*?[?]>",wg="<![A-Za-z][^>]*>",Fg="<!\\[CDATA\\[[\\s\\S]*?\\]\\]>",Ig=new RegExp("^(?:"+kf+"|"+Af+"|"+Dg+"|"+Tg+"|"+wg+"|"+Fg+")"),Og=new RegExp("^(?:"+kf+"|"+Af+")"),Ru=[[/^<(script|pre|style|textarea)(?=(\s|>|$))/i,/<\/(script|pre|style|textarea)>/i,!0],[/^\x3C!--/,/-->/,!0],[/^<\?/,/\?>/,!0],[/^<![A-Z]/,/>/,!0],[/^<!\[CDATA\[/,/\]\]>/,!0],[new RegExp("^</?("+xg.join("|")+")(?=(\\s|/?>|$))","i"),/^$/,!0],[new RegExp(Og.source+"\\s*$"),/^$/,!1]];function Ng(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4||!e.md.options.html||e.src.charCodeAt(r)!==60)return!1;let s=e.src.slice(r,i),o=0;for(;o<Ru.length&&!Ru[o][0].test(s);o++);if(o===Ru.length)return!1;if(n)return Ru[o][2];let c=t+1;if(!Ru[o][1].test(s)){for(;c<u&&!(e.sCount[c]<e.blkIndent);c++)if(r=e.bMarks[c]+e.tShift[c],i=e.eMarks[c],s=e.src.slice(r,i),Ru[o][1].test(s)){s.length!==0&&c++;break}}e.line=c;const l=e.push("html_block","",0);return l.map=[t,c],l.content=e.getLines(t,c,e.blkIndent,!0),!0}function Rg(e,t,u,n){let r=e.bMarks[t]+e.tShift[t],i=e.eMarks[t];if(e.sCount[t]-e.blkIndent>=4)return!1;let s=e.src.charCodeAt(r);if(s!==35||r>=i)return!1;let 
o=1;for(s=e.src.charCodeAt(++r);s===35&&r<i&&o<=6;)o++,s=e.src.charCodeAt(++r);if(o>6||r<i&&!ae(s))return!1;if(n)return!0;i=e.skipSpacesBack(i,r);const c=e.skipCharsBack(i,35,r);c>r&&ae(e.src.charCodeAt(c-1))&&(i=c),e.line=t+1;const l=e.push("heading_open","h"+String(o),1);l.markup="########".slice(0,o),l.map=[t,e.line];const a=e.push("inline","",0);a.content=e.src.slice(r,i).trim(),a.map=[t,e.line],a.children=[];const f=e.push("heading_close","h"+String(o),-1);return f.markup="########".slice(0,o),!0}function Mg(e,t,u){const n=e.md.block.ruler.getRules("paragraph");if(e.sCount[t]-e.blkIndent>=4)return!1;const r=e.parentType;e.parentType="paragraph";let i=0,s,o=t+1;for(;o<u&&!e.isEmpty(o);o++){if(e.sCount[o]-e.blkIndent>3)continue;if(e.sCount[o]>=e.blkIndent){let d=e.bMarks[o]+e.tShift[o];const h=e.eMarks[o];if(d<h&&(s=e.src.charCodeAt(d),(s===45||s===61)&&(d=e.skipChars(d,s),d=e.skipSpaces(d),d>=h))){i=s===61?1:2;break}}if(e.sCount[o]<0)continue;let p=!1;for(let d=0,h=n.length;d<h;d++)if(n[d](e,o,u,!0)){p=!0;break}if(p)break}if(!i)return!1;const c=e.getLines(t,o,e.blkIndent,!1).trim();e.line=o+1;const l=e.push("heading_open","h"+String(i),1);l.markup=String.fromCharCode(s),l.map=[t,e.line];const a=e.push("inline","",0);a.content=c,a.map=[t,e.line-1],a.children=[];const f=e.push("heading_close","h"+String(i),-1);return f.markup=String.fromCharCode(s),e.parentType=r,!0}function Pg(e,t,u){const n=e.md.block.ruler.getRules("paragraph"),r=e.parentType;let i=t+1;for(e.parentType="paragraph";i<u&&!e.isEmpty(i);i++){if(e.sCount[i]-e.blkIndent>3||e.sCount[i]<0)continue;let l=!1;for(let a=0,f=n.length;a<f;a++)if(n[a](e,i,u,!0)){l=!0;break}if(l)break}const s=e.getLines(t,i,e.blkIndent,!1).trim();e.line=i;const o=e.push("paragraph_open","p",1);o.map=[t,e.line];const c=e.push("inline","",0);return c.content=s,c.map=[t,e.line],c.children=[],e.push("paragraph_close","p",-1),e.parentType=r,!0}const 
br=[["table",dg,["paragraph","reference"]],["code",hg],["fence",pg,["paragraph","reference","blockquote","list"]],["blockquote",mg,["paragraph","reference","blockquote","list"]],["hr",bg,["paragraph","reference","blockquote","list"]],["list",_g,["paragraph","reference","blockquote"]],["reference",yg],["html_block",Ng,["paragraph","reference","blockquote"]],["heading",Rg,["paragraph","reference","blockquote"]],["lheading",Mg],["paragraph",Pg]];function Ti(){this.ruler=new Je;for(let e=0;e<br.length;e++)this.ruler.push(br[e][0],br[e][1],{alt:(br[e][2]||[]).slice()})}Ti.prototype.tokenize=function(e,t,u){const n=this.ruler.getRules(""),r=n.length,i=e.md.options.maxNesting;let s=t,o=!1;for(;s<u&&(e.line=s=e.skipEmptyLines(s),!(s>=u||e.sCount[s]<e.blkIndent));){if(e.level>=i){e.line=u;break}const c=e.line;let l=!1;for(let a=0;a<r;a++)if(l=n[a](e,s,u,!1),l){if(c>=e.line)throw new Error("block rule didn't increment state.line");break}if(!l)throw new Error("none of the block rules matched");e.tight=!o,e.isEmpty(e.line-1)&&(o=!0),s=e.line,s<u&&e.isEmpty(s)&&(o=!0,s++,e.line=s)}};Ti.prototype.parse=function(e,t,u,n){if(!e)return;const r=new this.State(e,t,u,n);this.tokenize(r,r.line,r.lineMax)};Ti.prototype.State=Tt;function tr(e,t,u,n){this.src=e,this.env=u,this.md=t,this.tokens=n,this.tokens_meta=Array(n.length),this.pos=0,this.posMax=this.src.length,this.level=0,this.pending="",this.pendingLevel=0,this.cache={},this.delimiters=[],this._prev_delimiters=[],this.backticks={},this.backticksScanned=!1,this.linkLevel=0}tr.prototype.pushPending=function(){const e=new _t("text","",0);return e.content=this.pending,e.level=this.pendingLevel,this.tokens.push(e),this.pending="",e};tr.prototype.push=function(e,t,u){this.pending&&this.pushPending();const n=new _t(e,t,u);let r=null;return 
u<0&&(this.level--,this.delimiters=this._prev_delimiters.pop()),n.level=this.level,u>0&&(this.level++,this._prev_delimiters.push(this.delimiters),this.delimiters=[],r={delimiters:this.delimiters}),this.pendingLevel=this.level,this.tokens.push(n),this.tokens_meta.push(r),n};tr.prototype.scanDelims=function(e,t){const u=this.posMax,n=this.src.charCodeAt(e),r=e>0?this.src.charCodeAt(e-1):32;let i=e;for(;i<u&&this.src.charCodeAt(i)===n;)i++;const s=i-e,o=i<u?this.src.charCodeAt(i):32,c=jn(r)||qn(String.fromCharCode(r)),l=jn(o)||qn(String.fromCharCode(o)),a=Un(r),f=Un(o),p=!f&&(!l||a||c),d=!a&&(!c||f||l);return{can_open:p&&(t||!d||c),can_close:d&&(t||!p||l),length:s}};tr.prototype.Token=_t;function Lg(e){switch(e){case 10:case 33:case 35:case 36:case 37:case 38:case 42:case 43:case 45:case 58:case 60:case 61:case 62:case 64:case 91:case 92:case 93:case 94:case 95:case 96:case 123:case 125:case 126:return!0;default:return!1}}function Bg(e,t){let u=e.pos;for(;u<e.posMax&&!Lg(e.src.charCodeAt(u));)u++;return u===e.pos?!1:(t||(e.pending+=e.src.slice(e.pos,u)),e.pos=u,!0)}const Vg=/(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$/i;function $g(e,t){if(!e.md.options.linkify||e.linkLevel>0)return!1;const u=e.pos,n=e.posMax;if(u+3>n||e.src.charCodeAt(u)!==58||e.src.charCodeAt(u+1)!==47||e.src.charCodeAt(u+2)!==47)return!1;const r=e.pending.match(Vg);if(!r)return!1;const i=r[1],s=e.md.linkify.matchAtStart(e.src.slice(u-i.length));if(!s)return!1;let o=s.url;if(o.length<=i.length)return!1;o=o.replace(/\*+$/,"");const c=e.md.normalizeLink(o);if(!e.md.validateLink(c))return!1;if(!t){e.pending=e.pending.slice(0,-i.length);const l=e.push("link_open","a",1);l.attrs=[["href",c]],l.markup="linkify",l.info="auto";const a=e.push("text","",0);a.content=e.md.normalizeLinkText(o);const f=e.push("link_close","a",-1);f.markup="linkify",f.info="auto"}return e.pos+=o.length-i.length,!0}function Hg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==10)return!1;const 
n=e.pending.length-1,r=e.posMax;if(!t)if(n>=0&&e.pending.charCodeAt(n)===32)if(n>=1&&e.pending.charCodeAt(n-1)===32){let i=n-1;for(;i>=1&&e.pending.charCodeAt(i-1)===32;)i--;e.pending=e.pending.slice(0,i),e.push("hardbreak","br",0)}else e.pending=e.pending.slice(0,-1),e.push("softbreak","br",0);else e.push("softbreak","br",0);for(u++;u<r&&ae(e.src.charCodeAt(u));)u++;return e.pos=u,!0}const $o=[];for(let e=0;e<256;e++)$o.push(0);"\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach(function(e){$o[e.charCodeAt(0)]=1});function Ug(e,t){let u=e.pos;const n=e.posMax;if(e.src.charCodeAt(u)!==92||(u++,u>=n))return!1;let r=e.src.charCodeAt(u);if(r===10){for(t||e.push("hardbreak","br",0),u++;u<n&&(r=e.src.charCodeAt(u),!!ae(r));)u++;return e.pos=u,!0}let i=e.src[u];if(r>=55296&&r<=56319&&u+1<n){const o=e.src.charCodeAt(u+1);o>=56320&&o<=57343&&(i+=e.src[u+1],u++)}const s="\\"+i;if(!t){const o=e.push("text_special","",0);r<256&&$o[r]!==0?o.content=i:o.content=s,o.markup=s,o.info="escape"}return e.pos=u+1,!0}function qg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==96)return!1;const r=u;u++;const i=e.posMax;for(;u<i&&e.src.charCodeAt(u)===96;)u++;const s=e.src.slice(r,u),o=s.length;if(e.backticksScanned&&(e.backticks[o]||0)<=r)return t||(e.pending+=s),e.pos+=o,!0;let c=u,l;for(;(l=e.src.indexOf("`",c))!==-1;){for(c=l+1;c<i&&e.src.charCodeAt(c)===96;)c++;const a=c-l;if(a===o){if(!t){const f=e.push("code_inline","code",0);f.markup=s,f.content=e.src.slice(u,l).replace(/\n/g," ").replace(/^ (.+) $/,"$1")}return e.pos=c,!0}e.backticks[a]=l}return e.backticksScanned=!0,t||(e.pending+=s),e.pos+=o,!0}function jg(e,t){const u=e.pos,n=e.src.charCodeAt(u);if(t||n!==126)return!1;const r=e.scanDelims(e.pos,!0);let i=r.length;const s=String.fromCharCode(n);if(i<2)return!1;let o;i%2&&(o=e.push("text","",0),o.content=s,i--);for(let c=0;c<i;c+=2)o=e.push("text","",0),o.content=s+s,e.delimiters.push({marker:n,length:0,token:e.tokens.length-1,end:-1,open:r.can_open,close:r.can_close});return 
e.pos+=r.length,!0}function hl(e,t){let u;const n=[],r=t.length;for(let i=0;i<r;i++){const s=t[i];if(s.marker!==126||s.end===-1)continue;const o=t[s.end];u=e.tokens[s.token],u.type="s_open",u.tag="s",u.nesting=1,u.markup="~~",u.content="",u=e.tokens[o.token],u.type="s_close",u.tag="s",u.nesting=-1,u.markup="~~",u.content="",e.tokens[o.token-1].type==="text"&&e.tokens[o.token-1].content==="~"&&n.push(o.token-1)}for(;n.length;){const i=n.pop();let s=i+1;for(;s<e.tokens.length&&e.tokens[s].type==="s_close";)s++;s--,i!==s&&(u=e.tokens[s],e.tokens[s]=e.tokens[i],e.tokens[i]=u)}}function zg(e){const t=e.tokens_meta,u=e.tokens_meta.length;hl(e,e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&hl(e,t[n].delimiters)}const Sf={tokenize:jg,postProcess:zg};function Kg(e,t){const u=e.pos,n=e.src.charCodeAt(u);if(t||n!==95&&n!==42)return!1;const r=e.scanDelims(e.pos,n===42);for(let i=0;i<r.length;i++){const s=e.push("text","",0);s.content=String.fromCharCode(n),e.delimiters.push({marker:n,length:r.length,token:e.tokens.length-1,end:-1,open:r.can_open,close:r.can_close})}return e.pos+=r.length,!0}function pl(e,t){const u=t.length;for(let n=u-1;n>=0;n--){const r=t[n];if(r.marker!==95&&r.marker!==42||r.end===-1)continue;const i=t[r.end],s=n>0&&t[n-1].end===r.end+1&&t[n-1].marker===r.marker&&t[n-1].token===r.token-1&&t[r.end+1].token===i.token+1,o=String.fromCharCode(r.marker),c=e.tokens[r.token];c.type=s?"strong_open":"em_open",c.tag=s?"strong":"em",c.nesting=1,c.markup=s?o+o:o,c.content="";const l=e.tokens[i.token];l.type=s?"strong_close":"em_close",l.tag=s?"strong":"em",l.nesting=-1,l.markup=s?o+o:o,l.content="",s&&(e.tokens[t[n-1].token].content="",e.tokens[t[r.end+1].token].content="",n--)}}function Wg(e){const t=e.tokens_meta,u=e.tokens_meta.length;pl(e,e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&pl(e,t[n].delimiters)}const vf={tokenize:Kg,postProcess:Wg};function Gg(e,t){let u,n,r,i,s="",o="",c=e.pos,l=!0;if(e.src.charCodeAt(e.pos)!==91)return!1;const 
a=e.pos,f=e.posMax,p=e.pos+1,d=e.md.helpers.parseLinkLabel(e,e.pos,!0);if(d<0)return!1;let h=d+1;if(h<f&&e.src.charCodeAt(h)===40){for(l=!1,h++;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);if(h>=f)return!1;if(c=h,r=e.md.helpers.parseLinkDestination(e.src,h,e.posMax),r.ok){for(s=e.md.normalizeLink(r.str),e.md.validateLink(s)?h=r.pos:s="",c=h;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);if(r=e.md.helpers.parseLinkTitle(e.src,h,e.posMax),h<f&&c!==h&&r.ok)for(o=r.str,h=r.pos;h<f&&(u=e.src.charCodeAt(h),!(!ae(u)&&u!==10));h++);}(h>=f||e.src.charCodeAt(h)!==41)&&(l=!0),h++}if(l){if(typeof e.env.references>"u")return!1;if(h<f&&e.src.charCodeAt(h)===91?(c=h+1,h=e.md.helpers.parseLinkLabel(e,h),h>=0?n=e.src.slice(c,h++):h=d+1):h=d+1,n||(n=e.src.slice(p,d)),i=e.env.references[Di(n)],!i)return e.pos=a,!1;s=i.href,o=i.title}if(!t){e.pos=p,e.posMax=d;const _=e.push("link_open","a",1),v=[["href",s]];_.attrs=v,o&&v.push(["title",o]),e.linkLevel++,e.md.inline.tokenize(e),e.linkLevel--,e.push("link_close","a",-1)}return e.pos=h,e.posMax=f,!0}function Zg(e,t){let u,n,r,i,s,o,c,l,a="";const f=e.pos,p=e.posMax;if(e.src.charCodeAt(e.pos)!==33||e.src.charCodeAt(e.pos+1)!==91)return!1;const d=e.pos+2,h=e.md.helpers.parseLinkLabel(e,e.pos+1,!1);if(h<0)return!1;if(i=h+1,i<p&&e.src.charCodeAt(i)===40){for(i++;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);if(i>=p)return!1;for(l=i,o=e.md.helpers.parseLinkDestination(e.src,i,e.posMax),o.ok&&(a=e.md.normalizeLink(o.str),e.md.validateLink(a)?i=o.pos:a=""),l=i;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);if(o=e.md.helpers.parseLinkTitle(e.src,i,e.posMax),i<p&&l!==i&&o.ok)for(c=o.str,i=o.pos;i<p&&(u=e.src.charCodeAt(i),!(!ae(u)&&u!==10));i++);else c="";if(i>=p||e.src.charCodeAt(i)!==41)return e.pos=f,!1;i++}else{if(typeof 
e.env.references>"u")return!1;if(i<p&&e.src.charCodeAt(i)===91?(l=i+1,i=e.md.helpers.parseLinkLabel(e,i),i>=0?r=e.src.slice(l,i++):i=h+1):i=h+1,r||(r=e.src.slice(d,h)),s=e.env.references[Di(r)],!s)return e.pos=f,!1;a=s.href,c=s.title}if(!t){n=e.src.slice(d,h);const _=[];e.md.inline.parse(n,e.md,e.env,_);const v=e.push("image","img",0),A=[["src",a],["alt",""]];v.attrs=A,v.children=_,v.content=n,c&&A.push(["title",c])}return e.pos=i,e.posMax=p,!0}const Jg=/^([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$/,Yg=/^([a-zA-Z][a-zA-Z0-9+.-]{1,31}):([^<>\x00-\x20]*)$/;function Xg(e,t){let u=e.pos;if(e.src.charCodeAt(u)!==60)return!1;const n=e.pos,r=e.posMax;for(;;){if(++u>=r)return!1;const s=e.src.charCodeAt(u);if(s===60)return!1;if(s===62)break}const i=e.src.slice(n+1,u);if(Yg.test(i)){const s=e.md.normalizeLink(i);if(!e.md.validateLink(s))return!1;if(!t){const o=e.push("link_open","a",1);o.attrs=[["href",s]],o.markup="autolink",o.info="auto";const c=e.push("text","",0);c.content=e.md.normalizeLinkText(i);const l=e.push("link_close","a",-1);l.markup="autolink",l.info="auto"}return e.pos+=i.length+2,!0}if(Jg.test(i)){const s=e.md.normalizeLink("mailto:"+i);if(!e.md.validateLink(s))return!1;if(!t){const o=e.push("link_open","a",1);o.attrs=[["href",s]],o.markup="autolink",o.info="auto";const c=e.push("text","",0);c.content=e.md.normalizeLinkText(i);const l=e.push("link_close","a",-1);l.markup="autolink",l.info="auto"}return e.pos+=i.length+2,!0}return!1}function Qg(e){return/^<a[>\s]/i.test(e)}function e3(e){return/^<\/a\s*>/i.test(e)}function t3(e){const t=e|32;return t>=97&&t<=122}function u3(e,t){if(!e.md.options.html)return!1;const u=e.posMax,n=e.pos;if(e.src.charCodeAt(n)!==60||n+2>=u)return!1;const r=e.src.charCodeAt(n+1);if(r!==33&&r!==63&&r!==47&&!t3(r))return!1;const i=e.src.slice(n).match(Ig);if(!i)return!1;if(!t){const 
s=e.push("html_inline","",0);s.content=i[0],Qg(s.content)&&e.linkLevel++,e3(s.content)&&e.linkLevel--}return e.pos+=i[0].length,!0}const n3=/^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));/i,r3=/^&([a-z][a-z0-9]{1,31});/i;function i3(e,t){const u=e.pos,n=e.posMax;if(e.src.charCodeAt(u)!==38||u+1>=n)return!1;if(e.src.charCodeAt(u+1)===35){const i=e.src.slice(u).match(n3);if(i){if(!t){const s=i[1][0].toLowerCase()==="x"?parseInt(i[1].slice(1),16):parseInt(i[1],10),o=e.push("text_special","",0);o.content=Bo(s)?Xr(s):Xr(65533),o.markup=i[0],o.info="entity"}return e.pos+=i[0].length,!0}}else{const i=e.src.slice(u).match(r3);if(i){const s=_f(i[0]);if(s!==i[0]){if(!t){const o=e.push("text_special","",0);o.content=s,o.markup=i[0],o.info="entity"}return e.pos+=i[0].length,!0}}}return!1}function ml(e){const t={},u=e.length;if(!u)return;let n=0,r=-2;const i=[];for(let s=0;s<u;s++){const o=e[s];if(i.push(0),(e[n].marker!==o.marker||r!==o.token-1)&&(n=s),r=o.token,o.length=o.length||0,!o.close)continue;t.hasOwnProperty(o.marker)||(t[o.marker]=[-1,-1,-1,-1,-1,-1]);const c=t[o.marker][(o.open?3:0)+o.length%3];let l=n-i[n]-1,a=l;for(;l>c;l-=i[l]+1){const f=e[l];if(f.marker===o.marker&&f.open&&f.end<0){let p=!1;if((f.close||o.open)&&(f.length+o.length)%3===0&&(f.length%3!==0||o.length%3!==0)&&(p=!0),!p){const d=l>0&&!e[l-1].open?i[l-1]+1:0;i[s]=s-l+d,i[l]=d,o.open=!1,f.end=s,f.close=!1,a=-1,r=-2;break}}}a!==-1&&(t[o.marker][(o.open?3:0)+(o.length||0)%3]=a)}}function s3(e){const t=e.tokens_meta,u=e.tokens_meta.length;ml(e.delimiters);for(let n=0;n<u;n++)t[n]&&t[n].delimiters&&ml(t[n].delimiters)}function o3(e){let t,u,n=0;const r=e.tokens,i=e.tokens.length;for(t=u=0;t<i;t++)r[t].nesting<0&&n--,r[t].level=n,r[t].nesting>0&&n++,r[t].type==="text"&&t+1<i&&r[t+1].type==="text"?r[t+1].content=r[t].content+r[t+1].content:(t!==u&&(r[u]=r[t]),u++);t!==u&&(r.length=u)}const 
Yi=[["text",Bg],["linkify",$g],["newline",Hg],["escape",Ug],["backticks",qg],["strikethrough",Sf.tokenize],["emphasis",vf.tokenize],["link",Gg],["image",Zg],["autolink",Xg],["html_inline",u3],["entity",i3]],Xi=[["balance_pairs",s3],["strikethrough",Sf.postProcess],["emphasis",vf.postProcess],["fragments_join",o3]];function ur(){this.ruler=new Je;for(let e=0;e<Yi.length;e++)this.ruler.push(Yi[e][0],Yi[e][1]);this.ruler2=new Je;for(let e=0;e<Xi.length;e++)this.ruler2.push(Xi[e][0],Xi[e][1])}ur.prototype.skipToken=function(e){const t=e.pos,u=this.ruler.getRules(""),n=u.length,r=e.md.options.maxNesting,i=e.cache;if(typeof i[t]<"u"){e.pos=i[t];return}let s=!1;if(e.level<r){for(let o=0;o<n;o++)if(e.level++,s=u[o](e,!0),e.level--,s){if(t>=e.pos)throw new Error("inline rule didn't increment state.pos");break}}else e.pos=e.posMax;s||e.pos++,i[t]=e.pos};ur.prototype.tokenize=function(e){const t=this.ruler.getRules(""),u=t.length,n=e.posMax,r=e.md.options.maxNesting;for(;e.pos<n;){const i=e.pos;let s=!1;if(e.level<r){for(let o=0;o<u;o++)if(s=t[o](e,!1),s){if(i>=e.pos)throw new Error("inline rule didn't increment state.pos");break}}if(s){if(e.pos>=n)break;continue}e.pending+=e.src[e.pos++]}e.pending&&e.pushPending()};ur.prototype.parse=function(e,t,u,n){const r=new this.State(e,t,u,n);this.tokenize(r);const i=this.ruler2.getRules(""),s=i.length;for(let o=0;o<s;o++)i[o](r)};ur.prototype.State=tr;function c3(e){const t={};e=e||{},t.src_Any=hf.source,t.src_Cc=pf.source,t.src_Z=bf.source,t.src_P=Po.source,t.src_ZPCc=[t.src_Z,t.src_P,t.src_Cc].join("|"),t.src_ZCc=[t.src_Z,t.src_Cc].join("|");const u="[><｜]";return 
t.src_pseudo_letter="(?:(?!"+u+"|"+t.src_ZPCc+")"+t.src_Any+")",t.src_ip4="(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",t.src_auth="(?:(?:(?!"+t.src_ZCc+"|[@/\\[\\]()]).)+@)?",t.src_port="(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?",t.src_host_terminator="(?=$|"+u+"|"+t.src_ZPCc+")(?!"+(e["---"]?"-(?!--)|":"-|")+"_|:\\d|\\.-|\\.(?!$|"+t.src_ZPCc+"))",t.src_path="(?:[/?#](?:(?!"+t.src_ZCc+"|"+u+`|[()[\\]{}.,"'?!\\-;]).|\\[(?:(?!`+t.src_ZCc+"|\\]).)*\\]|\\((?:(?!"+t.src_ZCc+"|[)]).)*\\)|\\{(?:(?!"+t.src_ZCc+'|[}]).)*\\}|\\"(?:(?!'+t.src_ZCc+`|["]).)+\\"|\\'(?:(?!`+t.src_ZCc+"|[']).)+\\'|\\'(?="+t.src_pseudo_letter+"|[-])|\\.{2,}[a-zA-Z0-9%/&]|\\.(?!"+t.src_ZCc+"|[.]|$)|"+(e["---"]?"\\-(?!--(?:[^-]|$))(?:-*)|":"\\-+|")+",(?!"+t.src_ZCc+"|$)|;(?!"+t.src_ZCc+"|$)|\\!+(?!"+t.src_ZCc+"|[!]|$)|\\?(?!"+t.src_ZCc+"|[?]|$))+|\\/)?",t.src_email_name='[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*',t.src_xn="xn--[a-z0-9\\-]{1,59}",t.src_domain_root="(?:"+t.src_xn+"|"+t.src_pseudo_letter+"{1,63})",t.src_domain="(?:"+t.src_xn+"|(?:"+t.src_pseudo_letter+")|(?:"+t.src_pseudo_letter+"(?:-|"+t.src_pseudo_letter+"){0,61}"+t.src_pseudo_letter+"))",t.src_host="(?:(?:(?:(?:"+t.src_domain+")\\.)*"+t.src_domain+"))",t.tpl_host_fuzzy="(?:"+t.src_ip4+"|(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%)))",t.tpl_host_no_ip_fuzzy="(?:(?:(?:"+t.src_domain+")\\.)+(?:%TLDS%))",t.src_host_strict=t.src_host+t.src_host_terminator,t.tpl_host_fuzzy_strict=t.tpl_host_fuzzy+t.src_host_terminator,t.src_host_port_strict=t.src_host+t.src_port+t.src_host_terminator,t.tpl_host_port_fuzzy_strict=t.tpl_host_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_port_no_ip_fuzzy_strict=t.tpl_host_no_ip_fuzzy+t.src_port+t.src_host_terminator,t.tpl_host_fuzzy_test="localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:"+t.src_ZPCc+"|>|$))",t.tpl_email_fuzzy="(^|"+u+'|"|\\(|'+t.src_ZCc+")("+t.src_email_name+"@"+t.tpl_host_fuzzy_strict+")"
,t.tpl_link_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`|｜]|"+t.src_ZPCc+"))((?![$+<=>^`|｜])"+t.tpl_host_port_fuzzy_strict+t.src_path+")",t.tpl_link_no_ip_fuzzy="(^|(?![.:/\\-_@])(?:[$+<=>^`|｜]|"+t.src_ZPCc+"))((?![$+<=>^`|｜])"+t.tpl_host_port_no_ip_fuzzy_strict+t.src_path+")",t}function Os(e){return Array.prototype.slice.call(arguments,1).forEach(function(u){u&&Object.keys(u).forEach(function(n){e[n]=u[n]})}),e}function wi(e){return Object.prototype.toString.call(e)}function l3(e){return wi(e)==="[object String]"}function a3(e){return wi(e)==="[object Object]"}function f3(e){return wi(e)==="[object RegExp]"}function bl(e){return wi(e)==="[object Function]"}function d3(e){return e.replace(/[.?*+^$[\]\\(){}|-]/g,"\\$&")}const Df={fuzzyLink:!0,fuzzyEmail:!0,fuzzyIP:!1};function h3(e){return Object.keys(e||{}).reduce(function(t,u){return t||Df.hasOwnProperty(u)},!1)}const p3={"http:":{validate:function(e,t,u){const n=e.slice(t);return u.re.http||(u.re.http=new RegExp("^\\/\\/"+u.re.src_auth+u.re.src_host_port_strict+u.re.src_path,"i")),u.re.http.test(n)?n.match(u.re.http)[0].length:0}},"https:":"http:","ftp:":"http:","//":{validate:function(e,t,u){const n=e.slice(t);return u.re.no_http||(u.re.no_http=new RegExp("^"+u.re.src_auth+"(?:localhost|(?:(?:"+u.re.src_domain+")\\.)+"+u.re.src_domain_root+")"+u.re.src_port+u.re.src_host_terminator+u.re.src_path,"i")),u.re.no_http.test(n)?t>=3&&e[t-3]===":"||t>=3&&e[t-3]==="/"?0:n.match(u.re.no_http)[0].length:0}},"mailto:":{validate:function(e,t,u){const n=e.slice(t);return u.re.mailto||(u.re.mailto=new 
RegExp("^"+u.re.src_email_name+"@"+u.re.src_host_strict,"i")),u.re.mailto.test(n)?n.match(u.re.mailto)[0].length:0}}},m3="a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]",b3="biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф".split("|");function g3(e){e.__index__=-1,e.__text_cache__=""}function _3(e){return function(t,u){const n=t.slice(u);return e.test(n)?n.match(e)[0].length:0}}function gl(){return function(e,t){t.normalize(e)}}function Qr(e){const t=e.re=c3(e.__opts__),u=e.__tlds__.slice();e.onCompile(),e.__tlds_replaced__||u.push(m3),u.push(t.src_xn),t.src_tlds=u.join("|");function n(o){return o.replace("%TLDS%",t.src_tlds)}t.email_fuzzy=RegExp(n(t.tpl_email_fuzzy),"i"),t.link_fuzzy=RegExp(n(t.tpl_link_fuzzy),"i"),t.link_no_ip_fuzzy=RegExp(n(t.tpl_link_no_ip_fuzzy),"i"),t.host_fuzzy_test=RegExp(n(t.tpl_host_fuzzy_test),"i");const r=[];e.__compiled__={};function i(o,c){throw new Error('(LinkifyIt) Invalid schema "'+o+'": '+c)}Object.keys(e.__schemas__).forEach(function(o){const c=e.__schemas__[o];if(c===null)return;const l={validate:null,link:null};if(e.__compiled__[o]=l,a3(c)){f3(c.validate)?l.validate=_3(c.validate):bl(c.validate)?l.validate=c.validate:i(o,c),bl(c.normalize)?l.normalize=c.normalize:c.normalize?i(o,c):l.normalize=gl();return}if(l3(c)){r.push(o);return}i(o,c)}),r.forEach(function(o){e.__compiled__[e.__schemas__[o]]&&(e.__compiled__[o].validate=e.__compiled__[e.__schemas__[o]].validate,e.__compiled__[o].normalize=e.__compiled__[e.__schemas__[o]].normalize)}),e.__compiled__[""]={validate:null,normalize:gl()};const s=Object.keys(e.__compiled__).filter(function(o){return 
o.length>0&&e.__compiled__[o]}).map(d3).join("|");e.re.schema_test=RegExp("(^|(?!_)(?:[><｜]|"+t.src_ZPCc+"))("+s+")","i"),e.re.schema_search=RegExp("(^|(?!_)(?:[><｜]|"+t.src_ZPCc+"))("+s+")","ig"),e.re.schema_at_start=RegExp("^"+e.re.schema_search.source,"i"),e.re.pretest=RegExp("("+e.re.schema_test.source+")|("+e.re.host_fuzzy_test.source+")|@","i"),g3(e)}function y3(e,t){const u=e.__index__,n=e.__last_index__,r=e.__text_cache__.slice(u,n);this.schema=e.__schema__.toLowerCase(),this.index=u+t,this.lastIndex=n+t,this.raw=r,this.text=r,this.url=r}function Ns(e,t){const u=new y3(e,t);return e.__compiled__[u.schema].normalize(u,e),u}function nt(e,t){if(!(this instanceof nt))return new nt(e,t);t||h3(e)&&(t=e,e={}),this.__opts__=Os({},Df,t),this.__index__=-1,this.__last_index__=-1,this.__schema__="",this.__text_cache__="",this.__schemas__=Os({},p3,e),this.__compiled__={},this.__tlds__=b3,this.__tlds_replaced__=!1,this.re={},Qr(this)}nt.prototype.add=function(t,u){return this.__schemas__[t]=u,Qr(this),this};nt.prototype.set=function(t){return this.__opts__=Os(this.__opts__,t),this};nt.prototype.test=function(t){if(this.__text_cache__=t,this.__index__=-1,!t.length)return!1;let u,n,r,i,s,o,c,l,a;if(this.re.schema_test.test(t)){for(c=this.re.schema_search,c.lastIndex=0;(u=c.exec(t))!==null;)if(i=this.testSchemaAt(t,u[2],c.lastIndex),i){this.__schema__=u[2],this.__index__=u.index+u[1].length,this.__last_index__=u.index+u[0].length+i;break}}return 
this.__opts__.fuzzyLink&&this.__compiled__["http:"]&&(l=t.search(this.re.host_fuzzy_test),l>=0&&(this.__index__<0||l<this.__index__)&&(n=t.match(this.__opts__.fuzzyIP?this.re.link_fuzzy:this.re.link_no_ip_fuzzy))!==null&&(s=n.index+n[1].length,(this.__index__<0||s<this.__index__)&&(this.__schema__="",this.__index__=s,this.__last_index__=n.index+n[0].length))),this.__opts__.fuzzyEmail&&this.__compiled__["mailto:"]&&(a=t.indexOf("@"),a>=0&&(r=t.match(this.re.email_fuzzy))!==null&&(s=r.index+r[1].length,o=r.index+r[0].length,(this.__index__<0||s<this.__index__||s===this.__index__&&o>this.__last_index__)&&(this.__schema__="mailto:",this.__index__=s,this.__last_index__=o))),this.__index__>=0};nt.prototype.pretest=function(t){return this.re.pretest.test(t)};nt.prototype.testSchemaAt=function(t,u,n){return this.__compiled__[u.toLowerCase()]?this.__compiled__[u.toLowerCase()].validate(t,n,this):0};nt.prototype.match=function(t){const u=[];let n=0;this.__index__>=0&&this.__text_cache__===t&&(u.push(Ns(this,n)),n=this.__last_index__);let r=n?t.slice(n):t;for(;this.test(r);)u.push(Ns(this,n)),r=r.slice(this.__last_index__),n+=this.__last_index__;return u.length?u:null};nt.prototype.matchAtStart=function(t){if(this.__text_cache__=t,this.__index__=-1,!t.length)return null;const u=this.re.schema_at_start.exec(t);if(!u)return null;const n=this.testSchemaAt(t,u[2],u[0].length);return n?(this.__schema__=u[2],this.__index__=u.index+u[1].length,this.__last_index__=u.index+u[0].length+n,Ns(this,0)):null};nt.prototype.tlds=function(t,u){return t=Array.isArray(t)?t:[t],u?(this.__tlds__=this.__tlds__.concat(t).sort().filter(function(n,r,i){return n!==i[r-1]}).reverse(),Qr(this),this):(this.__tlds__=t.slice(),this.__tlds_replaced__=!0,Qr(this),this)};nt.prototype.normalize=function(t){t.schema||(t.url="http://"+t.url),t.schema==="mailto:"&&!/^mailto:/i.test(t.url)&&(t.url="mailto:"+t.url)};nt.prototype.onCompile=function(){};const 
qu=2147483647,At=36,Ho=1,zn=26,x3=38,E3=700,Tf=72,wf=128,Ff="-",C3=/^xn--/,k3=/[^\0-\x7F]/,A3=/[\x2E\u3002\uFF0E\uFF61]/g,S3={overflow:"Overflow: input needs wider integers to process","not-basic":"Illegal input >= 0x80 (not a basic code point)","invalid-input":"Invalid input"},Qi=At-Ho,St=Math.floor,es=String.fromCharCode;function Xt(e){throw new RangeError(S3[e])}function v3(e,t){const u=[];let n=e.length;for(;n--;)u[n]=t(e[n]);return u}function If(e,t){const u=e.split("@");let n="";u.length>1&&(n=u[0]+"@",e=u[1]),e=e.replace(A3,".");const r=e.split("."),i=v3(r,t).join(".");return n+i}function Of(e){const t=[];let u=0;const n=e.length;for(;u<n;){const r=e.charCodeAt(u++);if(r>=55296&&r<=56319&&u<n){const i=e.charCodeAt(u++);(i&64512)==56320?t.push(((r&1023)<<10)+(i&1023)+65536):(t.push(r),u--)}else t.push(r)}return t}const D3=e=>String.fromCodePoint(...e),T3=function(e){return e>=48&&e<58?26+(e-48):e>=65&&e<91?e-65:e>=97&&e<123?e-97:At},_l=function(e,t){return e+22+75*(e<26)-((t!=0)<<5)},Nf=function(e,t,u){let n=0;for(e=u?St(e/E3):e>>1,e+=St(e/t);e>Qi*zn>>1;n+=At)e=St(e/Qi);return St(n+(Qi+1)*e/(e+x3))},Rf=function(e){const t=[],u=e.length;let n=0,r=wf,i=Tf,s=e.lastIndexOf(Ff);s<0&&(s=0);for(let o=0;o<s;++o)e.charCodeAt(o)>=128&&Xt("not-basic"),t.push(e.charCodeAt(o));for(let o=s>0?s+1:0;o<u;){const c=n;for(let a=1,f=At;;f+=At){o>=u&&Xt("invalid-input");const p=T3(e.charCodeAt(o++));p>=At&&Xt("invalid-input"),p>St((qu-n)/a)&&Xt("overflow"),n+=p*a;const d=f<=i?Ho:f>=i+zn?zn:f-i;if(p<d)break;const h=At-d;a>St(qu/h)&&Xt("overflow"),a*=h}const l=t.length+1;i=Nf(n-c,l,c==0),St(n/l)>qu-r&&Xt("overflow"),r+=St(n/l),n%=l,t.splice(n++,0,r)}return String.fromCodePoint(...t)},Mf=function(e){const t=[];e=Of(e);const u=e.length;let n=wf,r=0,i=Tf;for(const c of e)c<128&&t.push(es(c));const s=t.length;let o=s;for(s&&t.push(Ff);o<u;){let c=qu;for(const a of e)a>=n&&a<c&&(c=a);const l=o+1;c-n>St((qu-r)/l)&&Xt("overflow"),r+=(c-n)*l,n=c;for(const a of 
e)if(a<n&&++r>qu&&Xt("overflow"),a===n){let f=r;for(let p=At;;p+=At){const d=p<=i?Ho:p>=i+zn?zn:p-i;if(f<d)break;const h=f-d,_=At-d;t.push(es(_l(d+h%_,0))),f=St(h/_)}t.push(es(_l(f,0))),i=Nf(r,l,o===s),r=0,++o}++r,++n}return t.join("")},w3=function(e){return If(e,function(t){return C3.test(t)?Rf(t.slice(4).toLowerCase()):t})},F3=function(e){return If(e,function(t){return k3.test(t)?"xn--"+Mf(t):t})},Pf={version:"2.3.1",ucs2:{decode:Of,encode:D3},decode:Rf,encode:Mf,toASCII:F3,toUnicode:w3},I3={options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:100},components:{core:{},block:{},inline:{}}},O3={options:{html:!1,xhtmlOut:!1,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["normalize","block","inline","text_join"]},block:{rules:["paragraph"]},inline:{rules:["text"],rules2:["balance_pairs","fragments_join"]}}},N3={options:{html:!0,xhtmlOut:!0,breaks:!1,langPrefix:"language-",linkify:!1,typographer:!1,quotes:"“”‘’",highlight:null,maxNesting:20},components:{core:{rules:["normalize","block","inline","text_join"]},block:{rules:["blockquote","code","fence","heading","hr","html_block","lheading","list","reference","paragraph"]},inline:{rules:["autolink","backticks","emphasis","entity","escape","html_inline","image","link","newline","text"],rules2:["balance_pairs","emphasis","fragments_join"]}}},R3={default:I3,zero:O3,commonmark:N3},M3=/^(vbscript|javascript|file|data):/,P3=/^data:image\/(gif|png|jpeg|webp);/;function L3(e){const t=e.trim().toLowerCase();return M3.test(t)?P3.test(t):!0}const Lf=["http:","https:","mailto:"];function B3(e){const t=Mo(e,!0);if(t.hostname&&(!t.protocol||Lf.indexOf(t.protocol)>=0))try{t.hostname=Pf.toASCII(t.hostname)}catch{}return er(Ro(t))}function V3(e){const t=Mo(e,!0);if(t.hostname&&(!t.protocol||Lf.indexOf(t.protocol)>=0))try{t.hostname=Pf.toUnicode(t.hostname)}catch{}return 
Qu(Ro(t),Qu.defaultChars+"%")}function at(e,t){if(!(this instanceof at))return new at(e,t);t||Lo(e)||(t=e||{},e="default"),this.inline=new ur,this.block=new Ti,this.core=new Vo,this.renderer=new rn,this.linkify=new nt,this.validateLink=L3,this.normalizeLink=B3,this.normalizeLinkText=V3,this.utils=Hb,this.helpers=vi({},zb),this.options={},this.configure(e),t&&this.set(t)}at.prototype.set=function(e){return vi(this.options,e),this};at.prototype.configure=function(e){const t=this;if(Lo(e)){const u=e;if(e=R3[u],!e)throw new Error('Wrong `markdown-it` preset "'+u+'", check name')}if(!e)throw new Error("Wrong `markdown-it` preset, can't be empty");return e.options&&t.set(e.options),e.components&&Object.keys(e.components).forEach(function(u){e.components[u].rules&&t[u].ruler.enableOnly(e.components[u].rules),e.components[u].rules2&&t[u].ruler2.enableOnly(e.components[u].rules2)}),this};at.prototype.enable=function(e,t){let u=[];Array.isArray(e)||(e=[e]),["core","block","inline"].forEach(function(r){u=u.concat(this[r].ruler.enable(e,!0))},this),u=u.concat(this.inline.ruler2.enable(e,!0));const n=e.filter(function(r){return u.indexOf(r)<0});if(n.length&&!t)throw new Error("MarkdownIt. Failed to enable unknown rule(s): "+n);return this};at.prototype.disable=function(e,t){let u=[];Array.isArray(e)||(e=[e]),["core","block","inline"].forEach(function(r){u=u.concat(this[r].ruler.disable(e,!0))},this),u=u.concat(this.inline.ruler2.disable(e,!0));const n=e.filter(function(r){return u.indexOf(r)<0});if(n.length&&!t)throw new Error("MarkdownIt. 
Failed to disable unknown rule(s): "+n);return this};at.prototype.use=function(e){const t=[this].concat(Array.prototype.slice.call(arguments,1));return e.apply(e,t),this};at.prototype.parse=function(e,t){if(typeof e!="string")throw new Error("Input data should be a String");const u=new this.core.State(e,this,t);return this.core.process(u),u.tokens};at.prototype.render=function(e,t){return t=t||{},this.renderer.render(this.parse(e,t),this.options,t)};at.prototype.parseInline=function(e,t){const u=new this.core.State(e,this,t);return u.inlineMode=!0,this.core.process(u),u.tokens};at.prototype.renderInline=function(e,t){return t=t||{},this.renderer.render(this.parseInline(e,t),this.options,t)};const $3=e=>!!e.toLowerCase,H3=e=>!$3(e)&&!isNaN(e),U3=e=>e.replace(/>/g,"&gt;").replace(/"/g,"&quot;"),Bf=e=>navigator.clipboard.writeText(e),q3=localStorage.getItem("base")||new URL(".",document.baseURI).href.toString(),ei={apiKey:"",systemMessage:"You are a helpful assistant.",samplers:"dkypmxt",temperature:.8,dynatemp_range:0,dynatemp_exponent:1,top_k:40,top_p:.95,min_p:.05,xtc_probability:0,xtc_threshold:.1,typical_p:1,repeat_last_n:64,repeat_penalty:1,presence_penalty:0,frequency_penalty:0,dry_multiplier:0,dry_base:1.75,dry_allowed_length:2,dry_penalty_last_n:-1,max_tokens:-1,custom:""},j3={apiKey:"Set the API Key if you are using --api-key option for the server.",systemMessage:"The starting message that defines how model should behave.",samplers:'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',temperature:"Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.",dynatemp_range:"Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.",dynatemp_exponent:"Addon for the temperature sampler. 
Smoothes out the probability redistribution based on the most probable token.",top_k:"Keeps only k top tokens.",top_p:"Limits tokens to those that together have a cumulative probability of at least p",min_p:"Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.",xtc_probability:"XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.",xtc_threshold:"XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.",typical_p:"Sorts and limits tokens based on the difference between log-probability and entropy.",repeat_last_n:"Last n tokens to consider for penalizing repetition",repeat_penalty:"Controls the repetition of token sequences in the generated text",presence_penalty:"Limits tokens based on whether they appear in the output or not.",frequency_penalty:"Limits tokens based on how often they appear in the output.",dry_multiplier:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.",dry_base:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.",dry_allowed_length:"DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.",dry_penalty_last_n:"DRY sampling reduces repetition in generated text even across long contexts. 
This parameter sets DRY penalty for the last n tokens.",max_tokens:"The maximum number of token per output.",custom:""},z3=Object.entries(ei).filter(e=>H3(e[1])).map(e=>e[0]),K3=["light","dark","cupcake","bumblebee","emerald","corporate","synthwave","retro","cyberpunk","valentine","halloween","garden","forest","aqua","lofi","pastel","fantasy","wireframe","black","luxury","dracula","cmyk","autumn","business","acid","lemonade","night","coffee","winter","dim","nord","sunset"],W3=Kn(e=>{const t=js(new at({breaks:!0})),u=t.value.renderer.rules.fence;t.value.renderer.rules.fence=(r,i,...s)=>{const o=r[i].content,c=u(r,i,...s);return`<div class="relative my-4">
-        <div class="text-right sticky top-4 mb-2 mr-2 h-0">
-          <button class="badge btn-mini" onclick="copyStr(${U3(JSON.stringify(o))})">📋 Copy</button>
-        </div>
-        ${c}
-      </div>`},window.copyStr=Bf;const n=lo(()=>t.value.render(e.source));return()=>ao("div",{innerHTML:n.value})},{props:["source"]}),G3=Kn({template:document.getElementById("settings-modal-short-input").innerHTML,props:{label:{type:String,required:!1},configKey:String,configDefault:Object,configInfo:Object,modelValue:[Object,String,Number]}}),he={getAllConversations(){const e=[];for(const t in localStorage)t.startsWith("conv-")&&e.push(JSON.parse(localStorage.getItem(t)));return e.sort((t,u)=>u.lastModified-t.lastModified),e},getOneConversation(e){return JSON.parse(localStorage.getItem(e)||"null")},appendMsg(e,t){if(t.content===null)return;const u=he.getOneConversation(e)||{id:e,lastModified:Date.now(),messages:[]};u.messages.push(t),u.lastModified=Date.now(),localStorage.setItem(e,JSON.stringify(u))},getNewConvId(){return`conv-${Date.now()}`},remove(e){localStorage.removeItem(e)},filterAndKeepMsgs(e,t){const u=he.getOneConversation(e);u&&(u.messages=u.messages.filter(t),u.lastModified=Date.now(),localStorage.setItem(e,JSON.stringify(u)))},popMsg(e){const t=he.getOneConversation(e);if(!t)return;const u=t.messages.pop();return t.lastModified=Date.now(),t.messages.length===0?he.remove(e):localStorage.setItem(e,JSON.stringify(t)),u},getConfig(){const e=JSON.parse(localStorage.getItem("config")||"{}");return{...ei,...e}},setConfig(e){localStorage.setItem("config",JSON.stringify(e))},getTheme(){return localStorage.getItem("theme")||"auto"},setTheme(e){e==="auto"?localStorage.removeItem("theme"):localStorage.setItem("theme",e)}},fn=e=>{const 
t=document.getElementById("messages-list"),u=t.scrollHeight-t.scrollTop-t.clientHeight;(!e||u<100)&&setTimeout(()=>t.scrollTo({top:t.scrollHeight}),1)},Vf=Ur({components:{VueMarkdown:W3,SettingsModalShortInput:G3},data(){return{conversations:he.getAllConversations(),messages:[],viewingConvId:he.getNewConvId(),inputMsg:"",isGenerating:!1,pendingMsg:null,stopGeneration:()=>{},selectedTheme:he.getTheme(),config:he.getConfig(),showConfigDialog:!1,editingMsg:null,themes:K3,configDefault:{...ei},configInfo:{...j3}}},computed:{},mounted(){document.getElementById("app").classList.remove("opacity-0");const e=document.getElementById("pending-msg");new ResizeObserver(()=>{this.isGenerating&&fn(!0)}).observe(e)},methods:{hideSidebar(){document.getElementById("toggle-drawer").checked=!1},setSelectedTheme(e){this.selectedTheme=e,he.setTheme(e)},newConversation(){this.isGenerating||(this.viewingConvId=he.getNewConvId(),this.editingMsg=null,this.fetchMessages(),fn(),this.hideSidebar())},setViewingConv(e){this.isGenerating||(this.viewingConvId=e,this.editingMsg=null,this.fetchMessages(),fn(),this.hideSidebar())},deleteConv(e){this.isGenerating||window.confirm("Are you sure to delete this conversation?")&&(he.remove(e),this.viewingConvId===e&&(this.viewingConvId=he.getNewConvId(),this.editingMsg=null),this.fetchConversation(),this.fetchMessages())},downloadConv(e){const t=he.getOneConversation(e);if(!t){alert("Conversation not found.");return}const u=JSON.stringify(t,null,2),n=new Blob([u],{type:"application/json"}),r=URL.createObjectURL(n),i=document.createElement("a");i.href=r,i.download=`conversation_${e}.json`,document.body.appendChild(i),i.click(),document.body.removeChild(i),URL.revokeObjectURL(r)},async sendMessage(){if(!this.inputMsg)return;const e=this.viewingConvId;he.appendMsg(e,{id:Date.now(),role:"user",content:this.inputMsg}),this.fetchConversation(),this.fetchMessages(),this.inputMsg="",this.editingMsg=null,this.generateMessage(e),fn()},async 
generateMessage(e){if(!this.isGenerating){this.pendingMsg={id:Date.now()+1,role:"assistant",content:null},this.isGenerating=!0,this.editingMsg=null;try{const t=new AbortController;this.stopGeneration=()=>t.abort();const u={messages:[{role:"system",content:this.config.systemMessage},...this.messages],stream:!0,cache_prompt:!0,samplers:this.config.samplers,temperature:this.config.temperature,dynatemp_range:this.config.dynatemp_range,dynatemp_exponent:this.config.dynatemp_exponent,top_k:this.config.top_k,top_p:this.config.top_p,min_p:this.config.min_p,typical_p:this.config.typical_p,xtc_probability:this.config.xtc_probability,xtc_threshold:this.config.xtc_threshold,repeat_last_n:this.config.repeat_last_n,repeat_penalty:this.config.repeat_penalty,presence_penalty:this.config.presence_penalty,frequency_penalty:this.config.frequency_penalty,dry_multiplier:this.config.dry_multiplier,dry_base:this.config.dry_base,dry_allowed_length:this.config.dry_allowed_length,dry_penalty_last_n:this.config.dry_penalty_last_n,max_tokens:this.config.max_tokens,...this.config.custom.length?JSON.parse(this.config.custom):{},...this.config.apiKey?{api_key:this.config.apiKey}:{}},n={controller:t,api_url:q3,endpoint:"/chat/completions"};for await(const r of eb(prompt,u,n)){const i=r.data.stop,s=r.data.choices[0].delta.content,o=this.pendingMsg.content||"";s&&(this.pendingMsg={id:this.pendingMsg.id,role:"assistant",content:o+s})}he.appendMsg(e,this.pendingMsg),this.fetchConversation(),this.fetchMessages(),setTimeout(()=>document.getElementById("msg-input").focus(),1)}catch(t){if(t.name==="AbortError")he.appendMsg(e,this.pendingMsg),this.fetchConversation(),this.fetchMessages();else{console.error(t),alert(t);const u=he.popMsg(e);this.inputMsg=u?u.content:""}}this.pendingMsg=null,this.isGenerating=!1,this.stopGeneration=()=>{},this.fetchMessages(),fn()}},regenerateMsg(e){if(this.isGenerating)return;const 
t=this.viewingConvId;he.filterAndKeepMsgs(t,u=>u.id<e.id),this.fetchConversation(),this.fetchMessages(),this.generateMessage(t)},copyMsg(e){Bf(e.content)},editUserMsgAndRegenerate(e){if(this.isGenerating)return;const t=this.viewingConvId,u=e.content;this.editingMsg=null,he.filterAndKeepMsgs(t,n=>n.id<e.id),he.appendMsg(t,{id:Date.now(),role:"user",content:u}),this.fetchConversation(),this.fetchMessages(),this.generateMessage(t)},closeAndSaveConfigDialog(){try{this.config.custom.length&&JSON.parse(this.config.custom)}catch{alert("Invalid JSON for custom config. Please either fix it or leave it empty.");return}for(const e of z3){if(isNaN(this.config[e])||this.config[e].toString().trim().length===0){alert(`Invalid number for ${e} (expected an integer or a float)`);return}this.config[e]=parseFloat(this.config[e])}this.showConfigDialog=!1,he.setConfig(this.config)},closeAndDiscardConfigDialog(){this.showConfigDialog=!1,this.config=he.getConfig()},resetConfigDialog(){window.confirm("Are you sure to reset all settings?")&&(this.config={...ei})},fetchConversation(){this.conversations=he.getAllConversations()},fetchMessages(){var e;this.messages=((e=he.getOneConversation(this.viewingConvId))==null?void 0:e.messages)??[]}}});Vf.config.errorHandler=alert;try{Vf.mount("#app")}catch(e){console.error(e),document.getElementById("app").innerHTML=`<div style="margin:2em auto">
-    Failed to start app. Please try clearing localStorage and try again.<br/>
-    <br/>
-    <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
-  </div>`}</script>
-  <style rel="stylesheet" crossorigin>*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: 
;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}:root,[data-theme]{background-color:var(--fallback-b1,oklch(var(--b1)/1));color:var(--fallback-bc,oklch(var(--bc)/1))}@supports not (color: oklch(0% 0 0)){:root{color-scheme:light;--fallback-p: #491eff;--fallback-pc: #d4dbff;--fallback-s: #ff41c7;--fallback-sc: #fff9fc;--fallback-a: #00cfbd;--fallback-ac: #00100d;--fallback-n: 
#2b3440;--fallback-nc: #d7dde4;--fallback-b1: #ffffff;--fallback-b2: #e5e6e6;--fallback-b3: #e5e6e6;--fallback-bc: #1f2937;--fallback-in: #00b3f0;--fallback-inc: #000000;--fallback-su: #00ca92;--fallback-suc: #000000;--fallback-wa: #ffc22d;--fallback-wac: #000000;--fallback-er: #ff6f70;--fallback-erc: #000000}@media (prefers-color-scheme: dark){:root{color-scheme:dark;--fallback-p: #7582ff;--fallback-pc: #050617;--fallback-s: #ff71cf;--fallback-sc: #190211;--fallback-a: #00c7b5;--fallback-ac: #000e0c;--fallback-n: #2a323c;--fallback-nc: #a6adbb;--fallback-b1: #1d232a;--fallback-b2: #191e24;--fallback-b3: #15191e;--fallback-bc: #a6adbb;--fallback-in: #00b3f0;--fallback-inc: #000000;--fallback-su: #00ca92;--fallback-suc: #000000;--fallback-wa: #ffc22d;--fallback-wac: #000000;--fallback-er: #ff6f70;--fallback-erc: #000000}}}html{-webkit-tap-highlight-color:transparent}*{scrollbar-color:color-mix(in oklch,currentColor 35%,transparent) transparent}*:hover{scrollbar-color:color-mix(in oklch,currentColor 60%,transparent) transparent}:root{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}@media (prefers-color-scheme: dark){:root{color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 
0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}}[data-theme=light]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}:root:has(input.theme-controller[value=light]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 89.824% .06192 275.75;--ac: 15.352% .0368 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 49.12% .3096 275.75;--s: 69.71% .329 342.55;--sc: 98.71% .0106 342.55;--a: 76.76% .184 183.61;--n: 32.1785% .02476 255.701624;--nc: 89.4994% .011585 252.096176;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.4169% .00108 197.137559;--bc: 27.8078% .029596 256.847952}[data-theme=dark]{color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 
83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}:root:has(input.theme-controller[value=dark]:checked){color-scheme:dark;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 13.138% .0392 275.75;--sc: 14.96% .052 342.55;--ac: 14.902% .0334 183.61;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 65.69% .196 275.75;--s: 74.8% .26 342.55;--a: 74.51% .167 183.61;--n: 31.3815% .021108 254.139175;--nc: 74.6477% .0216 264.435964;--b1: 25.3267% .015896 252.417568;--b2: 23.2607% .013807 253.100675;--b3: 21.1484% .01165 254.087939;--bc: 74.6477% .0216 264.435964}[data-theme=cupcake]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 15.2344% .017892 200.026556;--sc: 15.787% .020249 356.29965;--ac: 15.8762% .029206 78.618794;--nc: 84.7148% .013247 313.189598;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--p: 76.172% .089459 200.026556;--s: 78.9351% .101246 356.29965;--a: 79.3811% .146032 78.618794;--n: 23.5742% .066235 313.189598;--b1: 97.7882% .00418 56.375637;--b2: 
93.9822% .007638 61.449292;--b3: 91.5861% .006811 53.440502;--bc: 23.5742% .066235 313.189598;--rounded-btn: 1.9rem;--tab-border: 2px;--tab-radius: .7rem}:root:has(input.theme-controller[value=cupcake]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 15.2344% .017892 200.026556;--sc: 15.787% .020249 356.29965;--ac: 15.8762% .029206 78.618794;--nc: 84.7148% .013247 313.189598;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--p: 76.172% .089459 200.026556;--s: 78.9351% .101246 356.29965;--a: 79.3811% .146032 78.618794;--n: 23.5742% .066235 313.189598;--b1: 97.7882% .00418 56.375637;--b2: 93.9822% .007638 61.449292;--b3: 91.5861% .006811 53.440502;--bc: 23.5742% .066235 313.189598;--rounded-btn: 1.9rem;--tab-border: 2px;--tab-radius: .7rem}[data-theme=bumblebee]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--ac: 16.254% .0314 56.52;--nc: 82.55% .015 281.99;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 89.51% .2132 96.61;--pc: 38.92% .046 96.61;--s: 80.39% .194 70.76;--sc: 39.38% .068 70.76;--a: 81.27% .157 56.52;--n: 12.75% .075 281.99;--b1: 100% 0 0}:root:has(input.theme-controller[value=bumblebee]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--ac: 16.254% .0314 56.52;--nc: 82.55% .015 281.99;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: 
.25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 89.51% .2132 96.61;--pc: 38.92% .046 96.61;--s: 80.39% .194 70.76;--sc: 39.38% .068 70.76;--a: 81.27% .157 56.52;--n: 12.75% .075 281.99;--b1: 100% 0 0}[data-theme=emerald]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 76.6626% .135433 153.450024;--pc: 33.3872% .040618 162.240129;--s: 61.3028% .202368 261.294233;--sc: 100% 0 0;--a: 72.7725% .149783 33.200363;--ac: 0% 0 0;--n: 35.5192% .032071 262.988584;--nc: 98.4625% .001706 247.838921;--b1: 100% 0 0;--bc: 35.5192% .032071 262.988584;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=emerald]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 76.6626% .135433 153.450024;--pc: 33.3872% .040618 162.240129;--s: 61.3028% .202368 261.294233;--sc: 100% 0 0;--a: 72.7725% .149783 33.200363;--ac: 0% 0 0;--n: 35.5192% .032071 262.988584;--nc: 98.4625% .001706 247.838921;--b1: 100% 0 0;--bc: 35.5192% .032071 262.988584;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=corporate]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 12.078% .0456 269.1;--sc: 13.0739% .010951 256.688055;--ac: 15.3934% .022799 163.57888;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--border-btn: 1px;--tab-border: 1px;--p: 
60.39% .228 269.1;--s: 65.3694% .054756 256.688055;--a: 76.9669% .113994 163.57888;--n: 22.3899% .031305 278.07229;--nc: 95.8796% .008588 247.915135;--b1: 100% 0 0;--bc: 22.3899% .031305 278.07229;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .25rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=corporate]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 12.078% .0456 269.1;--sc: 13.0739% .010951 256.688055;--ac: 15.3934% .022799 163.57888;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--border-btn: 1px;--tab-border: 1px;--p: 60.39% .228 269.1;--s: 65.3694% .054756 256.688055;--a: 76.9669% .113994 163.57888;--n: 22.3899% .031305 278.07229;--nc: 95.8796% .008588 247.915135;--b1: 100% 0 0;--bc: 22.3899% .031305 278.07229;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .25rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=synthwave]{color-scheme:dark;--b2: 20.2941% .076211 287.835609;--b3: 18.7665% .070475 287.835609;--pc: 14.4421% .031903 342.009383;--sc: 15.6543% .02362 227.382405;--ac: 17.608% .0412 93.72;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 72.2105% .159514 342.009383;--s: 78.2714% .118101 227.382405;--a: 88.04% .206 93.72;--n: 25.5554% .103537 286.507967;--nc: 97.9365% .00819 301.358346;--b1: 21.8216% .081948 287.835609;--bc: 97.9365% .00819 301.358346;--in: 76.5197% .12273 231.831603;--inc: 23.5017% .096418 290.329844;--su: 86.0572% .115038 178.624677;--suc: 23.5017% .096418 290.329844;--wa: 85.531% .122117 93.722227;--wac: 23.5017% .096418 290.329844;--er: 73.7005% .121339 32.639257;--erc: 23.5017% .096418 
290.329844}:root:has(input.theme-controller[value=synthwave]:checked){color-scheme:dark;--b2: 20.2941% .076211 287.835609;--b3: 18.7665% .070475 287.835609;--pc: 14.4421% .031903 342.009383;--sc: 15.6543% .02362 227.382405;--ac: 17.608% .0412 93.72;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 72.2105% .159514 342.009383;--s: 78.2714% .118101 227.382405;--a: 88.04% .206 93.72;--n: 25.5554% .103537 286.507967;--nc: 97.9365% .00819 301.358346;--b1: 21.8216% .081948 287.835609;--bc: 97.9365% .00819 301.358346;--in: 76.5197% .12273 231.831603;--inc: 23.5017% .096418 290.329844;--su: 86.0572% .115038 178.624677;--suc: 23.5017% .096418 290.329844;--wa: 85.531% .122117 93.722227;--wac: 23.5017% .096418 290.329844;--er: 73.7005% .121339 32.639257;--erc: 23.5017% .096418 290.329844}[data-theme=retro]{color-scheme:light;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 76.8664% .104092 22.664655;--pc: 26.5104% .006243 .522862;--s: 80.7415% .052534 159.094608;--sc: 26.5104% .006243 .522862;--a: 70.3919% .125455 52.953428;--ac: 26.5104% .006243 .522862;--n: 28.4181% .009519 355.534017;--nc: 92.5604% .025113 89.217311;--b1: 91.6374% .034554 90.51575;--b2: 88.2722% .049418 91.774344;--b3: 84.133% .065952 90.856665;--bc: 26.5104% .006243 .522862;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33;--rounded-box: .4rem;--rounded-btn: .4rem;--rounded-badge: .4rem;--tab-radius: .4rem}:root:has(input.theme-controller[value=retro]:checked){color-scheme:light;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 
27.33;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 76.8664% .104092 22.664655;--pc: 26.5104% .006243 .522862;--s: 80.7415% .052534 159.094608;--sc: 26.5104% .006243 .522862;--a: 70.3919% .125455 52.953428;--ac: 26.5104% .006243 .522862;--n: 28.4181% .009519 355.534017;--nc: 92.5604% .025113 89.217311;--b1: 91.6374% .034554 90.51575;--b2: 88.2722% .049418 91.774344;--b3: 84.133% .065952 90.856665;--bc: 26.5104% .006243 .522862;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33;--rounded-box: .4rem;--rounded-btn: .4rem;--rounded-badge: .4rem;--tab-radius: .4rem}[data-theme=cyberpunk]{color-scheme:light;--b2: 87.8943% .16647 104.32;--b3: 81.2786% .15394 104.32;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 18.902% .0358 104.32;--pc: 14.844% .0418 6.35;--sc: 16.666% .0368 204.72;--ac: 14.372% .04352 310.43;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p: 74.22% .209 6.35;--s: 83.33% .184 204.72;--a: 71.86% .2176 310.43;--n: 23.04% .065 269.31;--nc: 94.51% .179 104.32;--b1: 94.51% .179 104.32;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--tab-radius: 0}:root:has(input.theme-controller[value=cyberpunk]:checked){color-scheme:light;--b2: 87.8943% .16647 104.32;--b3: 81.2786% .15394 104.32;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 18.902% .0358 104.32;--pc: 14.844% .0418 6.35;--sc: 16.666% .0368 204.72;--ac: 14.372% .04352 310.43;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 
1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p: 74.22% .209 6.35;--s: 83.33% .184 204.72;--a: 71.86% .2176 310.43;--n: 23.04% .065 269.31;--nc: 94.51% .179 104.32;--b1: 94.51% .179 104.32;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--tab-radius: 0}[data-theme=valentine]{color-scheme:light;--b2: 88.0567% .024834 337.06289;--b3: 81.4288% .022964 337.06289;--pc: 13.7239% .030755 15.066527;--sc: 14.3942% .029258 293.189609;--ac: 14.2537% .014961 197.828857;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.614% .0414 27.33;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 68.6197% .153774 15.066527;--s: 71.971% .14629 293.189609;--a: 71.2685% .074804 197.828857;--n: 54.6053% .143342 358.004839;--nc: 90.2701% .037202 336.955191;--b1: 94.6846% .026703 337.06289;--bc: 37.3085% .081131 4.606426;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.07% .207 27.33;--rounded-btn: 1.9rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=valentine]:checked){color-scheme:light;--b2: 88.0567% .024834 337.06289;--b3: 81.4288% .022964 337.06289;--pc: 13.7239% .030755 15.066527;--sc: 14.3942% .029258 293.189609;--ac: 14.2537% .014961 197.828857;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.614% .0414 27.33;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 68.6197% .153774 15.066527;--s: 71.971% .14629 293.189609;--a: 71.2685% .074804 197.828857;--n: 54.6053% .143342 358.004839;--nc: 90.2701% .037202 336.955191;--b1: 94.6846% .026703 337.06289;--bc: 37.3085% .081131 4.606426;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 
149.213788;--wa: 66.584% .157422 58.31834;--er: 73.07% .207 27.33;--rounded-btn: 1.9rem;--tab-radius: .7rem}[data-theme=halloween]{color-scheme:dark;--b2: 23.0416% 0 0;--b3: 21.3072% 0 0;--bc: 84.9552% 0 0;--sc: 89.196% .0496 305.03;--nc: 84.8742% .009322 65.681484;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 77.48% .204 60.62;--pc: 19.6935% .004671 196.779412;--s: 45.98% .248 305.03;--a: 64.8% .223 136.073479;--ac: 0% 0 0;--n: 24.371% .046608 65.681484;--b1: 24.7759% 0 0;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33}:root:has(input.theme-controller[value=halloween]:checked){color-scheme:dark;--b2: 23.0416% 0 0;--b3: 21.3072% 0 0;--bc: 84.9552% 0 0;--sc: 89.196% .0496 305.03;--nc: 84.8742% .009322 65.681484;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 13.144% .0398 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 77.48% .204 60.62;--pc: 19.6935% .004671 196.779412;--s: 45.98% .248 305.03;--a: 64.8% .223 136.073479;--ac: 0% 0 0;--n: 24.371% .046608 65.681484;--b1: 24.7759% 0 0;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 65.72% .199 27.33}[data-theme=garden]{color-scheme:light;--b2: 86.4453% .002011 17.197414;--b3: 79.9386% .00186 17.197414;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--sc: 89.699% .022197 355.095988;--ac: 11.2547% .010859 154.390187;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 
0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 62.45% .278 3.83636;--pc: 100% 0 0;--s: 48.4952% .110985 355.095988;--a: 56.2735% .054297 154.390187;--n: 24.1559% .049362 89.070594;--nc: 92.9519% .002163 17.197414;--b1: 92.9519% .002163 17.197414;--bc: 16.9617% .001664 17.32068}:root:has(input.theme-controller[value=garden]:checked){color-scheme:light;--b2: 86.4453% .002011 17.197414;--b3: 79.9386% .00186 17.197414;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--sc: 89.699% .022197 355.095988;--ac: 11.2547% .010859 154.390187;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 62.45% .278 3.83636;--pc: 100% 0 0;--s: 48.4952% .110985 355.095988;--a: 56.2735% .054297 154.390187;--n: 24.1559% .049362 89.070594;--nc: 92.9519% .002163 17.197414;--b1: 92.9519% .002163 17.197414;--bc: 16.9617% .001664 17.32068}[data-theme=forest]{color-scheme:dark;--b2: 17.522% .007709 17.911578;--b3: 16.2032% .007129 17.911578;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 83.7682% .001658 17.911578;--sc: 13.9553% .027077 168.327128;--ac: 14.1257% .02389 185.713193;--nc: 86.1397% .007806 171.364646;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 68.6283% .185567 148.958922;--pc: 0% 0 0;--s: 69.7764% .135385 168.327128;--a: 70.6285% .119451 185.713193;--n: 30.6985% .039032 171.364646;--b1: 18.8409% .00829 17.911578;--rounded-btn: 
1.9rem}:root:has(input.theme-controller[value=forest]:checked){color-scheme:dark;--b2: 17.522% .007709 17.911578;--b3: 16.2032% .007129 17.911578;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 83.7682% .001658 17.911578;--sc: 13.9553% .027077 168.327128;--ac: 14.1257% .02389 185.713193;--nc: 86.1397% .007806 171.364646;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 68.6283% .185567 148.958922;--pc: 0% 0 0;--s: 69.7764% .135385 168.327128;--a: 70.6285% .119451 185.713193;--n: 30.6985% .039032 171.364646;--b1: 18.8409% .00829 17.911578;--rounded-btn: 1.9rem}[data-theme=aqua]{color-scheme:dark;--b2: 45.3464% .118611 261.181672;--b3: 41.9333% .109683 261.181672;--bc: 89.7519% .025508 261.181672;--sc: 12.1365% .02175 309.782946;--ac: 18.6854% .020445 94.555431;--nc: 12.2124% .023402 243.760661;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.79% .038 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 85.6617% .14498 198.6458;--pc: 40.1249% .068266 197.603872;--s: 60.6827% .108752 309.782946;--a: 93.4269% .102225 94.555431;--n: 61.0622% .117009 243.760661;--b1: 48.7596% .127539 261.181672;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.95% .19 27.33}:root:has(input.theme-controller[value=aqua]:checked){color-scheme:dark;--b2: 45.3464% .118611 261.181672;--b3: 41.9333% .109683 261.181672;--bc: 89.7519% .025508 261.181672;--sc: 12.1365% .02175 309.782946;--ac: 18.6854% .020445 94.555431;--nc: 12.2124% .023402 243.760661;--inc: 90.923% .043042 262.880917;--suc: 12.541% .033982 
149.213788;--wac: 13.3168% .031484 58.31834;--erc: 14.79% .038 27.33;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 85.6617% .14498 198.6458;--pc: 40.1249% .068266 197.603872;--s: 60.6827% .108752 309.782946;--a: 93.4269% .102225 94.555431;--n: 61.0622% .117009 243.760661;--b1: 48.7596% .127539 261.181672;--in: 54.615% .215208 262.880917;--su: 62.7052% .169912 149.213788;--wa: 66.584% .157422 58.31834;--er: 73.95% .19 27.33}[data-theme=lofi]{color-scheme:light;--inc: 15.908% .0206 205.9;--suc: 18.026% .0306 164.14;--wac: 17.674% .027 79.94;--erc: 15.732% .03 28.47;--border-btn: 1px;--tab-border: 1px;--p: 15.9066% 0 0;--pc: 100% 0 0;--s: 21.455% .001566 17.278957;--sc: 100% 0 0;--a: 26.8618% 0 0;--ac: 100% 0 0;--n: 0% 0 0;--nc: 100% 0 0;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.268% .001082 17.17934;--bc: 0% 0 0;--in: 79.54% .103 205.9;--su: 90.13% .153 164.14;--wa: 88.37% .135 79.94;--er: 78.66% .15 28.47;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .125rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}:root:has(input.theme-controller[value=lofi]:checked){color-scheme:light;--inc: 15.908% .0206 205.9;--suc: 18.026% .0306 164.14;--wac: 17.674% .027 79.94;--erc: 15.732% .03 28.47;--border-btn: 1px;--tab-border: 1px;--p: 15.9066% 0 0;--pc: 100% 0 0;--s: 21.455% .001566 17.278957;--sc: 100% 0 0;--a: 26.8618% 0 0;--ac: 100% 0 0;--n: 0% 0 0;--nc: 100% 0 0;--b1: 100% 0 0;--b2: 96.1151% 0 0;--b3: 92.268% .001082 17.17934;--bc: 0% 0 0;--in: 79.54% .103 205.9;--su: 90.13% .153 164.14;--wa: 88.37% .135 79.94;--er: 78.66% .15 28.47;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem;--tab-radius: .125rem;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1}[data-theme=pastel]{color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% 
.199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--pc: 16.6166% .006979 316.8737;--sc: 17.6153% .009839 8.688364;--ac: 17.8419% .012056 170.923263;--nc: 14.2681% .014702 228.183906;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 83.0828% .034896 316.8737;--s: 88.0763% .049197 8.688364;--a: 89.2096% .06028 170.923263;--n: 71.3406% .07351 228.183906;--b1: 100% 0 0;--b2: 98.4625% .001706 247.838921;--b3: 87.1681% .009339 258.338227;--rounded-btn: 1.9rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=pastel]:checked){color-scheme:light;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--bc: 20% 0 0;--pc: 16.6166% .006979 316.8737;--sc: 17.6153% .009839 8.688364;--ac: 17.8419% .012056 170.923263;--nc: 14.2681% .014702 228.183906;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 83.0828% .034896 316.8737;--s: 88.0763% .049197 8.688364;--a: 89.2096% .06028 170.923263;--n: 71.3406% .07351 228.183906;--b1: 100% 0 0;--b2: 98.4625% .001706 247.838921;--b3: 87.1681% .009339 258.338227;--rounded-btn: 1.9rem;--tab-radius: .7rem}[data-theme=fantasy]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 87.49% .0378 325.02;--sc: 90.784% .0324 241.36;--ac: 15.196% .0408 56.72;--nc: 85.5616% .005919 256.847952;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 37.45% .189 325.02;--s: 53.92% .162 241.36;--a: 75.98% .204 56.72;--n: 
27.8078% .029596 256.847952;--b1: 100% 0 0;--bc: 27.8078% .029596 256.847952}:root:has(input.theme-controller[value=fantasy]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--in: 72.06% .191 231.6;--su: 64.8% .15 160;--wa: 84.71% .199 83.87;--er: 71.76% .221 22.18;--pc: 87.49% .0378 325.02;--sc: 90.784% .0324 241.36;--ac: 15.196% .0408 56.72;--nc: 85.5616% .005919 256.847952;--inc: 0% 0 0;--suc: 0% 0 0;--wac: 0% 0 0;--erc: 0% 0 0;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 37.45% .189 325.02;--s: 53.92% .162 241.36;--a: 75.98% .204 56.72;--n: 27.8078% .029596 256.847952;--b1: 100% 0 0;--bc: 27.8078% .029596 256.847952}[data-theme=wireframe]{color-scheme:light;--bc: 20% 0 0;--pc: 15.6521% 0 0;--sc: 15.6521% 0 0;--ac: 15.6521% 0 0;--nc: 18.8014% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 14.1626% .019994 108.702381;--erc: 12.5591% .051537 29.233885;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:Chalkboard,comic sans ms,"sans-serif";--p: 78.2604% 0 0;--s: 78.2604% 0 0;--a: 78.2604% 0 0;--n: 94.007% 0 0;--b1: 100% 0 0;--b2: 94.9119% 0 0;--b3: 89.7547% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 70.8131% .099969 108.702381;--er: 62.7955% .257683 29.233885;--rounded-box: .2rem;--rounded-btn: .2rem;--rounded-badge: .2rem;--tab-radius: .2rem}:root:has(input.theme-controller[value=wireframe]:checked){color-scheme:light;--bc: 20% 0 0;--pc: 15.6521% 0 0;--sc: 15.6521% 0 0;--ac: 15.6521% 0 0;--nc: 18.8014% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 14.1626% .019994 108.702381;--erc: 12.5591% .051537 29.233885;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;font-family:Chalkboard,comic sans 
ms,"sans-serif";--p: 78.2604% 0 0;--s: 78.2604% 0 0;--a: 78.2604% 0 0;--n: 94.007% 0 0;--b1: 100% 0 0;--b2: 94.9119% 0 0;--b3: 89.7547% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 70.8131% .099969 108.702381;--er: 62.7955% .257683 29.233885;--rounded-box: .2rem;--rounded-btn: .2rem;--rounded-badge: .2rem;--tab-radius: .2rem}[data-theme=black]{color-scheme:dark;--pc: 86.736% 0 0;--sc: 86.736% 0 0;--ac: 86.736% 0 0;--nc: 86.736% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 19.3597% .042201 109.769232;--erc: 12.5591% .051537 29.233885;--border-btn: 1px;--tab-border: 1px;--p: 33.6799% 0 0;--s: 33.6799% 0 0;--a: 33.6799% 0 0;--b1: 0% 0 0;--b2: 19.1251% 0 0;--b3: 26.8618% 0 0;--bc: 87.6096% 0 0;--n: 33.6799% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 96.7983% .211006 109.769232;--er: 62.7955% .257683 29.233885;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1;--tab-radius: 0}:root:has(input.theme-controller[value=black]:checked){color-scheme:dark;--pc: 86.736% 0 0;--sc: 86.736% 0 0;--ac: 86.736% 0 0;--nc: 86.736% 0 0;--inc: 89.0403% .062643 264.052021;--suc: 90.395% .035372 142.495339;--wac: 19.3597% .042201 109.769232;--erc: 12.5591% .051537 29.233885;--border-btn: 1px;--tab-border: 1px;--p: 33.6799% 0 0;--s: 33.6799% 0 0;--a: 33.6799% 0 0;--b1: 0% 0 0;--b2: 19.1251% 0 0;--b3: 26.8618% 0 0;--bc: 87.6096% 0 0;--n: 33.6799% 0 0;--in: 45.2014% .313214 264.052021;--su: 51.9752% .176858 142.495339;--wa: 96.7983% .211006 109.769232;--er: 62.7955% .257683 29.233885;--rounded-box: 0;--rounded-btn: 0;--rounded-badge: 0;--animation-btn: 0;--animation-input: 0;--btn-focus-scale: 1;--tab-radius: 0}[data-theme=luxury]{color-scheme:dark;--pc: 20% 0 0;--sc: 85.5163% .012821 261.069149;--ac: 87.3349% .010348 338.82597;--inc: 15.8122% .024356 237.133883;--suc: 15.6239% .038579 132.154381;--wac: 17.2255% .027305 
102.89115;--erc: 14.3506% .035271 22.568916;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 100% 0 0;--s: 27.5815% .064106 261.069149;--a: 36.6744% .051741 338.82597;--n: 24.27% .057015 59.825019;--nc: 93.2033% .089631 90.861683;--b1: 14.0765% .004386 285.822869;--b2: 20.2191% .004211 308.22937;--b3: 29.8961% .003818 308.318612;--bc: 75.6879% .123666 76.890484;--in: 79.0612% .121778 237.133883;--su: 78.1197% .192894 132.154381;--wa: 86.1274% .136524 102.89115;--er: 71.7531% .176357 22.568916}:root:has(input.theme-controller[value=luxury]:checked){color-scheme:dark;--pc: 20% 0 0;--sc: 85.5163% .012821 261.069149;--ac: 87.3349% .010348 338.82597;--inc: 15.8122% .024356 237.133883;--suc: 15.6239% .038579 132.154381;--wac: 17.2255% .027305 102.89115;--erc: 14.3506% .035271 22.568916;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 100% 0 0;--s: 27.5815% .064106 261.069149;--a: 36.6744% .051741 338.82597;--n: 24.27% .057015 59.825019;--nc: 93.2033% .089631 90.861683;--b1: 14.0765% .004386 285.822869;--b2: 20.2191% .004211 308.22937;--b3: 29.8961% .003818 308.318612;--bc: 75.6879% .123666 76.890484;--in: 79.0612% .121778 237.133883;--su: 78.1197% .192894 132.154381;--wa: 86.1274% .136524 102.89115;--er: 71.7531% .176357 22.568916}[data-theme=dracula]{color-scheme:dark;--b2: 26.8053% .020556 277.508664;--b3: 24.7877% .019009 277.508664;--pc: 15.0922% .036614 346.812432;--sc: 14.8405% .029709 301.883095;--ac: 16.6785% .024826 66.558491;--nc: 87.8891% .006515 275.524078;--inc: 17.6526% .018676 212.846491;--suc: 17.4199% .043903 148.024881;--wac: 19.1068% .026849 112.757109;--erc: 13.6441% .041266 24.430965;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: 
.25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.4611% .18307 346.812432;--s: 74.2023% .148546 301.883095;--a: 83.3927% .124132 66.558491;--n: 39.4456% .032576 275.524078;--b1: 28.8229% .022103 277.508664;--bc: 97.7477% .007913 106.545019;--in: 88.263% .09338 212.846491;--su: 87.0995% .219516 148.024881;--wa: 95.5338% .134246 112.757109;--er: 68.2204% .206328 24.430965}:root:has(input.theme-controller[value=dracula]:checked){color-scheme:dark;--b2: 26.8053% .020556 277.508664;--b3: 24.7877% .019009 277.508664;--pc: 15.0922% .036614 346.812432;--sc: 14.8405% .029709 301.883095;--ac: 16.6785% .024826 66.558491;--nc: 87.8891% .006515 275.524078;--inc: 17.6526% .018676 212.846491;--suc: 17.4199% .043903 148.024881;--wac: 19.1068% .026849 112.757109;--erc: 13.6441% .041266 24.430965;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.4611% .18307 346.812432;--s: 74.2023% .148546 301.883095;--a: 83.3927% .124132 66.558491;--n: 39.4456% .032576 275.524078;--b1: 28.8229% .022103 277.508664;--bc: 97.7477% .007913 106.545019;--in: 88.263% .09338 212.846491;--su: 87.0995% .219516 148.024881;--wa: 95.5338% .134246 112.757109;--er: 68.2204% .206328 24.430965}[data-theme=cmyk]{color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--bc: 20% 0 0;--pc: 14.3544% .02666 239.443325;--sc: 12.8953% .040552 359.339283;--ac: 18.8458% .037948 105.306968;--nc: 84.3557% 0 0;--inc: 13.6952% .0189 217.284104;--suc: 89.3898% .032505 321.406278;--wac: 14.2473% .031969 52.023412;--erc: 12.4027% .041677 28.717543;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.7722% .133298 239.443325;--s: 64.4766% .202758 359.339283;--a: 94.2289% .189741 
105.306968;--n: 21.7787% 0 0;--b1: 100% 0 0;--in: 68.4759% .094499 217.284104;--su: 46.949% .162524 321.406278;--wa: 71.2364% .159843 52.023412;--er: 62.0133% .208385 28.717543}:root:has(input.theme-controller[value=cmyk]:checked){color-scheme:light;--b2: 93% 0 0;--b3: 86% 0 0;--bc: 20% 0 0;--pc: 14.3544% .02666 239.443325;--sc: 12.8953% .040552 359.339283;--ac: 18.8458% .037948 105.306968;--nc: 84.3557% 0 0;--inc: 13.6952% .0189 217.284104;--suc: 89.3898% .032505 321.406278;--wac: 14.2473% .031969 52.023412;--erc: 12.4027% .041677 28.717543;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.7722% .133298 239.443325;--s: 64.4766% .202758 359.339283;--a: 94.2289% .189741 105.306968;--n: 21.7787% 0 0;--b1: 100% 0 0;--in: 68.4759% .094499 217.284104;--su: 46.949% .162524 321.406278;--wa: 71.2364% .159843 52.023412;--er: 62.0133% .208385 28.717543}[data-theme=autumn]{color-scheme:light;--b2: 89.1077% 0 0;--b3: 82.4006% 0 0;--bc: 19.1629% 0 0;--pc: 88.1446% .032232 17.530175;--sc: 12.3353% .033821 23.865865;--ac: 14.6851% .018999 60.729616;--nc: 90.8734% .007475 51.902819;--inc: 13.8449% .019596 207.284192;--suc: 12.199% .016032 174.616213;--wac: 14.0163% .032982 56.844303;--erc: 90.614% .0482 24.16;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 40.7232% .16116 17.530175;--s: 61.6763% .169105 23.865865;--a: 73.4253% .094994 60.729616;--n: 54.3672% .037374 51.902819;--b1: 95.8147% 0 0;--in: 69.2245% .097979 207.284192;--su: 60.9951% .080159 174.616213;--wa: 70.0817% .164909 56.844303;--er: 53.07% .241 24.16}:root:has(input.theme-controller[value=autumn]:checked){color-scheme:light;--b2: 89.1077% 0 0;--b3: 82.4006% 0 0;--bc: 19.1629% 0 0;--pc: 88.1446% .032232 17.530175;--sc: 12.3353% 
.033821 23.865865;--ac: 14.6851% .018999 60.729616;--nc: 90.8734% .007475 51.902819;--inc: 13.8449% .019596 207.284192;--suc: 12.199% .016032 174.616213;--wac: 14.0163% .032982 56.844303;--erc: 90.614% .0482 24.16;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 40.7232% .16116 17.530175;--s: 61.6763% .169105 23.865865;--a: 73.4253% .094994 60.729616;--n: 54.3672% .037374 51.902819;--b1: 95.8147% 0 0;--in: 69.2245% .097979 207.284192;--su: 60.9951% .080159 174.616213;--wa: 70.0817% .164909 56.844303;--er: 53.07% .241 24.16}[data-theme=business]{color-scheme:dark;--b2: 22.6487% 0 0;--b3: 20.944% 0 0;--bc: 84.8707% 0 0;--pc: 88.3407% .019811 251.473931;--sc: 12.8185% .005481 229.389418;--ac: 13.4542% .033545 35.791525;--nc: 85.4882% .00265 253.041249;--inc: 12.5233% .028702 240.033697;--suc: 14.0454% .018919 156.59611;--wac: 15.4965% .023141 81.519177;--erc: 90.3221% .029356 29.674507;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 41.7036% .099057 251.473931;--s: 64.0924% .027405 229.389418;--a: 67.271% .167726 35.791525;--n: 27.441% .01325 253.041249;--b1: 24.3535% 0 0;--in: 62.6163% .143511 240.033697;--su: 70.2268% .094594 156.59611;--wa: 77.4824% .115704 81.519177;--er: 51.6105% .14678 29.674507;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem}:root:has(input.theme-controller[value=business]:checked){color-scheme:dark;--b2: 22.6487% 0 0;--b3: 20.944% 0 0;--bc: 84.8707% 0 0;--pc: 88.3407% .019811 251.473931;--sc: 12.8185% .005481 229.389418;--ac: 13.4542% .033545 35.791525;--nc: 85.4882% .00265 253.041249;--inc: 12.5233% .028702 240.033697;--suc: 14.0454% .018919 156.59611;--wac: 15.4965% .023141 81.519177;--erc: 90.3221% .029356 29.674507;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 
1px;--tab-border: 1px;--tab-radius: .5rem;--p: 41.7036% .099057 251.473931;--s: 64.0924% .027405 229.389418;--a: 67.271% .167726 35.791525;--n: 27.441% .01325 253.041249;--b1: 24.3535% 0 0;--in: 62.6163% .143511 240.033697;--su: 70.2268% .094594 156.59611;--wa: 77.4824% .115704 81.519177;--er: 51.6105% .14678 29.674507;--rounded-box: .25rem;--rounded-btn: .125rem;--rounded-badge: .125rem}[data-theme=acid]{color-scheme:light;--b2: 91.6146% 0 0;--b3: 84.7189% 0 0;--bc: 19.7021% 0 0;--pc: 14.38% .0714 330.759573;--sc: 14.674% .0448 48.250878;--ac: 18.556% .0528 122.962951;--nc: 84.262% .0256 278.68;--inc: 12.144% .0454 252.05;--suc: 17.144% .0532 158.53;--wac: 18.202% .0424 100.5;--erc: 12.968% .0586 29.349188;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 71.9% .357 330.759573;--s: 73.37% .224 48.250878;--a: 92.78% .264 122.962951;--n: 21.31% .128 278.68;--b1: 98.5104% 0 0;--in: 60.72% .227 252.05;--su: 85.72% .266 158.53;--wa: 91.01% .212 100.5;--er: 64.84% .293 29.349188;--rounded-box: 1.25rem;--rounded-btn: 1rem;--rounded-badge: 1rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=acid]:checked){color-scheme:light;--b2: 91.6146% 0 0;--b3: 84.7189% 0 0;--bc: 19.7021% 0 0;--pc: 14.38% .0714 330.759573;--sc: 14.674% .0448 48.250878;--ac: 18.556% .0528 122.962951;--nc: 84.262% .0256 278.68;--inc: 12.144% .0454 252.05;--suc: 17.144% .0532 158.53;--wac: 18.202% .0424 100.5;--erc: 12.968% .0586 29.349188;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 71.9% .357 330.759573;--s: 73.37% .224 48.250878;--a: 92.78% .264 122.962951;--n: 21.31% .128 278.68;--b1: 98.5104% 0 0;--in: 60.72% .227 252.05;--su: 85.72% .266 158.53;--wa: 91.01% .212 100.5;--er: 64.84% .293 29.349188;--rounded-box: 1.25rem;--rounded-btn: 1rem;--rounded-badge: 1rem;--tab-radius: .7rem}[data-theme=lemonade]{color-scheme:light;--b2: 91.8003% .0186 123.72;--b3: 84.8906% 
.0172 123.72;--bc: 19.742% .004 123.72;--pc: 11.784% .0398 134.6;--sc: 15.55% .0392 111.09;--ac: 17.078% .0402 100.73;--nc: 86.196% .015 108.6;--inc: 17.238% .0094 224.14;--suc: 17.238% .0094 157.85;--wac: 17.238% .0094 102.15;--erc: 17.238% .0094 25.85;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 58.92% .199 134.6;--s: 77.75% .196 111.09;--a: 85.39% .201 100.73;--n: 30.98% .075 108.6;--b1: 98.71% .02 123.72;--in: 86.19% .047 224.14;--su: 86.19% .047 157.85;--wa: 86.19% .047 102.15;--er: 86.19% .047 25.85}:root:has(input.theme-controller[value=lemonade]:checked){color-scheme:light;--b2: 91.8003% .0186 123.72;--b3: 84.8906% .0172 123.72;--bc: 19.742% .004 123.72;--pc: 11.784% .0398 134.6;--sc: 15.55% .0392 111.09;--ac: 17.078% .0402 100.73;--nc: 86.196% .015 108.6;--inc: 17.238% .0094 224.14;--suc: 17.238% .0094 157.85;--wac: 17.238% .0094 102.15;--erc: 17.238% .0094 25.85;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 58.92% .199 134.6;--s: 77.75% .196 111.09;--a: 85.39% .201 100.73;--n: 30.98% .075 108.6;--b1: 98.71% .02 123.72;--in: 86.19% .047 224.14;--su: 86.19% .047 157.85;--wa: 86.19% .047 102.15;--er: 86.19% .047 25.85}[data-theme=night]{color-scheme:dark;--b2: 19.3144% .037037 265.754874;--b3: 17.8606% .034249 265.754874;--bc: 84.1536% .007965 265.754874;--pc: 15.0703% .027798 232.66148;--sc: 13.6023% .031661 276.934902;--ac: 14.4721% .035244 350.048739;--nc: 85.5899% .00737 260.030984;--suc: 15.6904% .026506 181.911977;--wac: 16.6486% .027912 82.95003;--erc: 14.3572% .034051 13.11834;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: 
.5rem;--p: 75.3513% .138989 232.66148;--s: 68.0113% .158303 276.934902;--a: 72.3603% .176218 350.048739;--n: 27.9495% .036848 260.030984;--b1: 20.7682% .039824 265.754874;--in: 68.4553% .148062 237.25135;--inc: 0% 0 0;--su: 78.452% .132529 181.911977;--wa: 83.2428% .139558 82.95003;--er: 71.7858% .170255 13.11834}:root:has(input.theme-controller[value=night]:checked){color-scheme:dark;--b2: 19.3144% .037037 265.754874;--b3: 17.8606% .034249 265.754874;--bc: 84.1536% .007965 265.754874;--pc: 15.0703% .027798 232.66148;--sc: 13.6023% .031661 276.934902;--ac: 14.4721% .035244 350.048739;--nc: 85.5899% .00737 260.030984;--suc: 15.6904% .026506 181.911977;--wac: 16.6486% .027912 82.95003;--erc: 14.3572% .034051 13.11834;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 75.3513% .138989 232.66148;--s: 68.0113% .158303 276.934902;--a: 72.3603% .176218 350.048739;--n: 27.9495% .036848 260.030984;--b1: 20.7682% .039824 265.754874;--in: 68.4553% .148062 237.25135;--inc: 0% 0 0;--su: 78.452% .132529 181.911977;--wa: 83.2428% .139558 82.95003;--er: 71.7858% .170255 13.11834}[data-theme=coffee]{color-scheme:dark;--b2: 20.1585% .021457 329.708637;--b3: 18.6412% .019842 329.708637;--pc: 14.3993% .024765 62.756393;--sc: 86.893% .00597 199.19444;--ac: 88.5243% .014881 224.389184;--nc: 83.3022% .003149 326.261446;--inc: 15.898% .012774 184.558367;--suc: 14.9445% .014491 131.116276;--wac: 17.6301% .028162 87.722413;--erc: 15.4637% .025644 31.871922;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.9967% .123825 62.756393;--s: 34.465% .029849 199.19444;--a: 42.6213% .074405 224.389184;--n: 16.5109% .015743 326.261446;--b1: 21.6758% .023072 329.708637;--bc: 72.3547% .092794 79.129387;--in: 79.4902% 
.063869 184.558367;--su: 74.7224% .072456 131.116276;--wa: 88.1503% .140812 87.722413;--er: 77.3187% .12822 31.871922}:root:has(input.theme-controller[value=coffee]:checked){color-scheme:dark;--b2: 20.1585% .021457 329.708637;--b3: 18.6412% .019842 329.708637;--pc: 14.3993% .024765 62.756393;--sc: 86.893% .00597 199.19444;--ac: 88.5243% .014881 224.389184;--nc: 83.3022% .003149 326.261446;--inc: 15.898% .012774 184.558367;--suc: 14.9445% .014491 131.116276;--wac: 17.6301% .028162 87.722413;--erc: 15.4637% .025644 31.871922;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 71.9967% .123825 62.756393;--s: 34.465% .029849 199.19444;--a: 42.6213% .074405 224.389184;--n: 16.5109% .015743 326.261446;--b1: 21.6758% .023072 329.708637;--bc: 72.3547% .092794 79.129387;--in: 79.4902% .063869 184.558367;--su: 74.7224% .072456 131.116276;--wa: 88.1503% .140812 87.722413;--er: 77.3187% .12822 31.871922}[data-theme=winter]{color-scheme:light;--pc: 91.372% .051 257.57;--sc: 88.5103% .03222 282.339433;--ac: 11.988% .038303 335.171434;--nc: 83.9233% .012704 257.651965;--inc: 17.6255% .017178 214.515264;--suc: 16.0988% .015404 197.823719;--wac: 17.8345% .009167 71.47031;--erc: 14.6185% .022037 20.076293;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 56.86% .255 257.57;--s: 42.5516% .161098 282.339433;--a: 59.9398% .191515 335.171434;--n: 19.6166% .063518 257.651965;--b1: 100% 0 0;--b2: 97.4663% .011947 259.822565;--b3: 93.2686% .016223 262.751375;--bc: 41.8869% .053885 255.824911;--in: 88.1275% .085888 214.515264;--su: 80.4941% .077019 197.823719;--wa: 89.1725% .045833 71.47031;--er: 73.0926% .110185 20.076293}:root:has(input.theme-controller[value=winter]:checked){color-scheme:light;--pc: 91.372% 
.051 257.57;--sc: 88.5103% .03222 282.339433;--ac: 11.988% .038303 335.171434;--nc: 83.9233% .012704 257.651965;--inc: 17.6255% .017178 214.515264;--suc: 16.0988% .015404 197.823719;--wac: 17.8345% .009167 71.47031;--erc: 14.6185% .022037 20.076293;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 56.86% .255 257.57;--s: 42.5516% .161098 282.339433;--a: 59.9398% .191515 335.171434;--n: 19.6166% .063518 257.651965;--b1: 100% 0 0;--b2: 97.4663% .011947 259.822565;--b3: 93.2686% .016223 262.751375;--bc: 41.8869% .053885 255.824911;--in: 88.1275% .085888 214.515264;--su: 80.4941% .077019 197.823719;--wa: 89.1725% .045833 71.47031;--er: 73.0926% .110185 20.076293}[data-theme=dim]{color-scheme:dark;--pc: 17.2267% .028331 139.549991;--sc: 14.6752% .033181 35.353059;--ac: 14.8459% .026728 311.37924;--inc: 17.2157% .028409 206.182959;--suc: 17.2343% .028437 166.534048;--wac: 17.2327% .028447 94.818679;--erc: 16.4838% .019914 33.756357;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 86.1335% .141656 139.549991;--s: 73.3759% .165904 35.353059;--a: 74.2296% .133641 311.37924;--n: 24.7311% .020483 264.094728;--nc: 82.9011% .031335 222.959324;--b1: 30.8577% .023243 264.149498;--b2: 28.0368% .01983 264.182074;--b3: 26.3469% .018403 262.177739;--bc: 82.9011% .031335 222.959324;--in: 86.0785% .142046 206.182959;--su: 86.1717% .142187 166.534048;--wa: 86.1634% .142236 94.818679;--er: 82.4189% .09957 33.756357}:root:has(input.theme-controller[value=dim]:checked){color-scheme:dark;--pc: 17.2267% .028331 139.549991;--sc: 14.6752% .033181 35.353059;--ac: 14.8459% .026728 311.37924;--inc: 17.2157% .028409 206.182959;--suc: 17.2343% .028437 166.534048;--wac: 17.2327% .028447 94.818679;--erc: 16.4838% 
.019914 33.756357;--rounded-box: 1rem;--rounded-btn: .5rem;--rounded-badge: 1.9rem;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--tab-radius: .5rem;--p: 86.1335% .141656 139.549991;--s: 73.3759% .165904 35.353059;--a: 74.2296% .133641 311.37924;--n: 24.7311% .020483 264.094728;--nc: 82.9011% .031335 222.959324;--b1: 30.8577% .023243 264.149498;--b2: 28.0368% .01983 264.182074;--b3: 26.3469% .018403 262.177739;--bc: 82.9011% .031335 222.959324;--in: 86.0785% .142046 206.182959;--su: 86.1717% .142187 166.534048;--wa: 86.1634% .142236 94.818679;--er: 82.4189% .09957 33.756357}[data-theme=nord]{color-scheme:light;--pc: 11.8872% .015449 254.027774;--sc: 13.9303% .011822 248.687186;--ac: 15.4929% .01245 217.469017;--inc: 13.8414% .012499 332.664922;--suc: 15.3654% .01498 131.063061;--wac: 17.0972% .017847 84.093335;--erc: 12.122% .024119 15.341883;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 59.4359% .077246 254.027774;--s: 69.6516% .059108 248.687186;--a: 77.4643% .062249 217.469017;--n: 45.229% .035214 264.1312;--nc: 89.9258% .016374 262.749256;--b1: 95.1276% .007445 260.731539;--b2: 93.2996% .010389 261.788485;--b3: 89.9258% .016374 262.749256;--bc: 32.4374% .022945 264.182036;--in: 69.2072% .062496 332.664922;--su: 76.827% .074899 131.063061;--wa: 85.4862% .089234 84.093335;--er: 60.61% .120594 15.341883;--rounded-box: .4rem;--rounded-btn: .2rem;--rounded-badge: .4rem;--tab-radius: .2rem}:root:has(input.theme-controller[value=nord]:checked){color-scheme:light;--pc: 11.8872% .015449 254.027774;--sc: 13.9303% .011822 248.687186;--ac: 15.4929% .01245 217.469017;--inc: 13.8414% .012499 332.664922;--suc: 15.3654% .01498 131.063061;--wac: 17.0972% .017847 84.093335;--erc: 12.122% .024119 15.341883;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 59.4359% .077246 254.027774;--s: 69.6516% 
.059108 248.687186;--a: 77.4643% .062249 217.469017;--n: 45.229% .035214 264.1312;--nc: 89.9258% .016374 262.749256;--b1: 95.1276% .007445 260.731539;--b2: 93.2996% .010389 261.788485;--b3: 89.9258% .016374 262.749256;--bc: 32.4374% .022945 264.182036;--in: 69.2072% .062496 332.664922;--su: 76.827% .074899 131.063061;--wa: 85.4862% .089234 84.093335;--er: 60.61% .120594 15.341883;--rounded-box: .4rem;--rounded-btn: .2rem;--rounded-badge: .4rem;--tab-radius: .2rem}[data-theme=sunset]{color-scheme:dark;--pc: 14.9408% .031656 39.94703;--sc: 14.5075% .035531 2.72034;--ac: 14.2589% .033336 299.844533;--inc: 17.1119% .017054 206.015183;--suc: 17.1122% .017172 144.77874;--wac: 17.1139% .016961 74.427797;--erc: 17.1023% .015778 16.886379;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 74.7039% .158278 39.94703;--s: 72.5375% .177654 2.72034;--a: 71.2947% .166678 299.844533;--n: 26% .019 237.69;--nc: 70% .019 237.69;--b1: 22% .019 237.69;--b2: 20% .019 237.69;--b3: 18% .019 237.69;--bc: 77.3835% .043586 245.096534;--in: 85.5596% .085271 206.015183;--su: 85.5609% .08586 144.77874;--wa: 85.5695% .084806 74.427797;--er: 85.5116% .07889 16.886379;--rounded-box: 1.2rem;--rounded-btn: .8rem;--rounded-badge: .4rem;--tab-radius: .7rem}:root:has(input.theme-controller[value=sunset]:checked){color-scheme:dark;--pc: 14.9408% .031656 39.94703;--sc: 14.5075% .035531 2.72034;--ac: 14.2589% .033336 299.844533;--inc: 17.1119% .017054 206.015183;--suc: 17.1122% .017172 144.77874;--wac: 17.1139% .016961 74.427797;--erc: 17.1023% .015778 16.886379;--animation-btn: .25s;--animation-input: .2s;--btn-focus-scale: .95;--border-btn: 1px;--tab-border: 1px;--p: 74.7039% .158278 39.94703;--s: 72.5375% .177654 2.72034;--a: 71.2947% .166678 299.844533;--n: 26% .019 237.69;--nc: 70% .019 237.69;--b1: 22% .019 237.69;--b2: 20% .019 237.69;--b3: 18% .019 237.69;--bc: 77.3835% .043586 245.096534;--in: 85.5596% .085271 206.015183;--su: 85.5609% 
.08586 144.77874;--wa: 85.5695% .084806 74.427797;--er: 85.5116% .07889 16.886379;--rounded-box: 1.2rem;--rounded-btn: .8rem;--rounded-badge: .4rem;--tab-radius: .7rem}.alert{display:grid;width:100%;grid-auto-flow:row;align-content:flex-start;align-items:center;justify-items:center;gap:1rem;text-align:center;border-radius:var(--rounded-box, 1rem);border-width:1px;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));padding:1rem;--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--alert-bg: var(--fallback-b2,oklch(var(--b2)/1));--alert-bg-mix: var(--fallback-b1,oklch(var(--b1)/1));background-color:var(--alert-bg)}@media (min-width: 640px){.alert{grid-auto-flow:column;grid-template-columns:auto minmax(auto,1fr);justify-items:start;text-align:start}}.avatar.placeholder>div{display:flex;align-items:center;justify-content:center}.badge{display:inline-flex;align-items:center;justify-content:center;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;height:1.25rem;font-size:.875rem;line-height:1.25rem;width:-moz-fit-content;width:fit-content;padding-left:.563rem;padding-right:.563rem;border-radius:var(--rounded-badge, 1.9rem);border-width:1px;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity: 
1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}@media (hover:hover){.label a:hover{--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.menu li>*:not(ul,.menu-title,details,.btn):active,.menu li>*:not(ul,.menu-title,details,.btn).active,.menu li>details>summary:active{--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}}.btn{display:inline-flex;height:3rem;min-height:3rem;flex-shrink:0;cursor:pointer;-webkit-user-select:none;-moz-user-select:none;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;border-radius:var(--rounded-btn, .5rem);border-color:transparent;border-color:oklch(var(--btn-color, var(--b2)) / var(--tw-border-opacity));padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1em;gap:.5rem;font-weight:600;text-decoration-line:none;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);border-width:var(--border-btn, 1px);transition-property:color,background-color,border-color,opacity,box-shadow,transform;--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-shadow: 0 1px 2px 0 rgb(0 0 0 / .05);--tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:var(--fallback-bc,oklch(var(--bc)/1));background-color:oklch(var(--btn-color, var(--b2)) / var(--tw-bg-opacity));--tw-bg-opacity: 1;--tw-border-opacity: 1}.btn-disabled,.btn[disabled],.btn:disabled{pointer-events:none}:where(.btn:is(input[type=checkbox])),:where(.btn:is(input[type=radio])){width:auto;-webkit-appearance:none;-moz-appearance:none;appearance:none}.btn:is(input[type=checkbox]):after,.btn:is(input[type=radio]):after{--tw-content: 
attr(aria-label);content:var(--tw-content)}.chat{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));-moz-column-gap:.75rem;column-gap:.75rem;padding-top:.25rem;padding-bottom:.25rem}.chat-bubble{position:relative;display:block;width:-moz-fit-content;width:fit-content;padding:.5rem 1rem;max-width:90%;border-radius:var(--rounded-box, 1rem);min-height:2.75rem;min-width:2.75rem;--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.chat-bubble:before{position:absolute;bottom:0;height:.75rem;width:.75rem;background-color:inherit;content:"";-webkit-mask-size:contain;mask-size:contain;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-position:center;mask-position:center}.chat-start{place-items:start;grid-template-columns:auto 1fr}.chat-start .chat-header,.chat-start .chat-footer{grid-column-start:2}.chat-start .chat-image{grid-column-start:1}.chat-start .chat-bubble{grid-column-start:2;border-end-start-radius:0px}.chat-start .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");inset-inline-start:-.749rem}[dir=rtl] .chat-start .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}.chat-end{place-items:end;grid-template-columns:1fr auto}.chat-end .chat-header,.chat-end 
.chat-footer{grid-column-start:1}.chat-end .chat-image{grid-column-start:2}.chat-end .chat-bubble{grid-column-start:1;border-end-end-radius:0px}.chat-end .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e");inset-inline-start:99.9%}[dir=rtl] .chat-end .chat-bubble:before{-webkit-mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e");mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}.checkbox{flex-shrink:0;--chkbg: var(--fallback-bc,oklch(var(--bc)/1));--chkfg: var(--fallback-b1,oklch(var(--b1)/1));height:1.5rem;width:1.5rem;cursor:pointer;-webkit-appearance:none;-moz-appearance:none;appearance:none;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity: .2}.collapse:not(td):not(tr):not(colgroup){visibility:visible}.collapse{position:relative;display:grid;overflow:hidden;grid-template-rows:auto 0fr;transition:grid-template-rows .2s;width:100%;border-radius:var(--rounded-box, 1rem)}.collapse-title,.collapse>input[type=checkbox],.collapse>input[type=radio],.collapse-content{grid-column-start:1;grid-row-start:1}.collapse>input[type=checkbox],.collapse>input[type=radio]{-webkit-appearance:none;-moz-appearance:none;appearance:none;opacity:0}.collapse-content{visibility:hidden;grid-column-start:1;grid-row-start:2;min-height:0px;transition:visibility .2s;transition:padding .2s ease-out,background-color .2s 
ease-out;padding-left:1rem;padding-right:1rem;cursor:unset}.collapse[open],.collapse-open,.collapse:focus:not(.collapse-close){grid-template-rows:auto 1fr}.collapse:not(.collapse-close):has(>input[type=checkbox]:checked),.collapse:not(.collapse-close):has(>input[type=radio]:checked){grid-template-rows:auto 1fr}.collapse[open]>.collapse-content,.collapse-open>.collapse-content,.collapse:focus:not(.collapse-close)>.collapse-content,.collapse:not(.collapse-close)>input[type=checkbox]:checked~.collapse-content,.collapse:not(.collapse-close)>input[type=radio]:checked~.collapse-content{visibility:visible;min-height:-moz-fit-content;min-height:fit-content}.drawer{position:relative;display:grid;grid-auto-columns:max-content auto;width:100%}.drawer-content{grid-column-start:2;grid-row-start:1;min-width:0px}.drawer-side{pointer-events:none;position:fixed;inset-inline-start:0px;top:0;grid-column-start:1;grid-row-start:1;display:grid;width:100%;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(1,minmax(0,1fr));align-items:flex-start;justify-items:start;overflow-x:hidden;overflow-y:hidden;overscroll-behavior:contain;height:100vh;height:100dvh}.drawer-side>.drawer-overlay{position:sticky;top:0;place-self:stretch;cursor:pointer;background-color:transparent;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s}.drawer-side>*{grid-column-start:1;grid-row-start:1}.drawer-side>*:not(.drawer-overlay){transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.3s;will-change:transform;transform:translate(-100%)}[dir=rtl] 
.drawer-side>*:not(.drawer-overlay){transform:translate(100%)}.drawer-toggle{position:fixed;height:0px;width:0px;-webkit-appearance:none;-moz-appearance:none;appearance:none;opacity:0}.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible;overflow-y:auto}.drawer-toggle:checked~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.drawer-end>.drawer-toggle~.drawer-content{grid-column-start:1}.drawer-end>.drawer-toggle~.drawer-side{grid-column-start:2;justify-items:end}.drawer-end>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(100%)}[dir=rtl] .drawer-end>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(-100%)}.drawer-end>.drawer-toggle:checked~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.dropdown{position:relative;display:inline-block}.dropdown>*:not(summary):focus{outline:2px solid transparent;outline-offset:2px}.dropdown .dropdown-content{position:absolute}.dropdown:is(:not(details)) .dropdown-content{visibility:hidden;opacity:0;transform-origin:top;--tw-scale-x: .95;--tw-scale-y: .95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s}.dropdown-end .dropdown-content{inset-inline-end:0px}.dropdown-left 
.dropdown-content{bottom:auto;inset-inline-end:100%;top:0;transform-origin:right}.dropdown-right .dropdown-content{bottom:auto;inset-inline-start:100%;top:0;transform-origin:left}.dropdown-bottom .dropdown-content{bottom:auto;top:100%;transform-origin:top}.dropdown-top .dropdown-content{bottom:100%;top:auto;transform-origin:bottom}.dropdown-end.dropdown-right .dropdown-content,.dropdown-end.dropdown-left .dropdown-content{bottom:0;top:auto}.dropdown.dropdown-open .dropdown-content,.dropdown:not(.dropdown-hover):focus .dropdown-content,.dropdown:focus-within .dropdown-content{visibility:visible;opacity:1}@media (hover: hover){.dropdown.dropdown-hover:hover .dropdown-content{visibility:visible;opacity:1}.btm-nav>*.disabled:hover,.btm-nav>*[disabled]:hover{pointer-events:none;--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btn:hover{--tw-border-opacity: 1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@supports (color: color-mix(in oklab,black,black)){.btn:hover{background-color:color-mix(in oklab,oklch(var(--btn-color, var(--b2)) / var(--tw-bg-opacity, 1)) 90%,black);border-color:color-mix(in oklab,oklch(var(--btn-color, var(--b2)) / var(--tw-border-opacity, 1)) 90%,black)}}@supports not (color: oklch(0% 0 0)){.btn:hover{background-color:var(--btn-color, var(--fallback-b2));border-color:var(--btn-color, var(--fallback-b2))}}.btn.glass:hover{--glass-opacity: 25%;--glass-border-opacity: 15%}.btn-ghost:hover{border-color:transparent}@supports (color: oklch(0% 0 0)){.btn-ghost:hover{background-color:var(--fallback-bc,oklch(var(--bc)/.2))}}.btn-outline.btn-primary:hover{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color: color-mix(in 
oklab,black,black)){.btn-outline.btn-primary:hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}}.btn-disabled:hover,.btn[disabled]:hover,.btn:disabled:hover{--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}@supports (color: color-mix(in oklab,black,black)){.btn:is(input[type=checkbox]:checked):hover,.btn:is(input[type=radio]:checked):hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}}.dropdown.dropdown-hover:hover .dropdown-content{--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{cursor:pointer;outline:2px solid transparent;outline-offset:2px}@supports (color: oklch(0% 0 0)){:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}}}.dropdown:is(details) summary::-webkit-details-marker{display:none}.form-control{display:flex;flex-direction:column}.label{display:flex;-webkit-user-select:none;-moz-user-select:none;user-select:none;align-items:center;justify-content:space-between;padding:.5rem 
.25rem}.input{flex-shrink:1;-webkit-appearance:none;-moz-appearance:none;appearance:none;height:3rem;padding-left:1rem;padding-right:1rem;font-size:1rem;line-height:2;line-height:1.5rem;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.input[type=number]::-webkit-inner-spin-button,.input-md[type=number]::-webkit-inner-spin-button{margin-top:-1rem;margin-bottom:-1rem;margin-inline-end:-1rem}.join :where(.join-item){border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:not(:first-child):not(:last-child),.join *:not(:first-child):not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:first-child:not(:last-child),.join *:first-child:not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0}.join .dropdown .join-item:first-child:not(:last-child),.join *:first-child:not(:last-child) .dropdown .join-item{border-start-end-radius:inherit;border-end-end-radius:inherit}.join :where(.join-item:first-child:not(:last-child)),.join :where(*:first-child:not(:last-child) .join-item){border-end-start-radius:inherit;border-start-start-radius:inherit}.join .join-item:last-child:not(:first-child),.join *:last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0}.join :where(.join-item:last-child:not(:first-child)),.join :where(*:last-child:not(:first-child) .join-item){border-start-end-radius:inherit;border-end-end-radius:inherit}@supports selector(:has(*)){:where(.join *:has(.join-item)){border-radius:inherit}}.menu{display:flex;flex-direction:column;flex-wrap:wrap;font-size:.875rem;line-height:1.25rem;padding:.5rem}.menu :where(li ul){position:relative;white-space:nowrap;margin-inline-start:1rem;padding-inline-start:.5rem}.menu 
:where(li:not(.menu-title)>*:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){display:grid;grid-auto-flow:column;align-content:flex-start;align-items:center;gap:.5rem;grid-auto-columns:minmax(auto,max-content) auto max-content;-webkit-user-select:none;-moz-user-select:none;user-select:none}.menu li.disabled{cursor:not-allowed;-webkit-user-select:none;-moz-user-select:none;user-select:none;color:var(--fallback-bc,oklch(var(--bc)/.3))}.menu :where(li>.menu-dropdown:not(.menu-dropdown-show)){display:none}:where(.menu li){position:relative;display:flex;flex-shrink:0;flex-direction:column;flex-wrap:wrap;align-items:stretch}:where(.menu li) .badge{justify-self:end}.modal{pointer-events:none;position:fixed;top:0;right:0;bottom:0;left:0;margin:0;display:grid;height:100%;max-height:none;width:100%;max-width:none;justify-items:center;padding:0;opacity:0;overscroll-behavior:contain;z-index:999;background-color:transparent;color:inherit;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity,visibility;overflow-y:hidden}:where(.modal){align-items:center}.modal-box{max-height:calc(100vh - 5em);grid-column-start:1;grid-row-start:1;width:91.666667%;max-width:32rem;--tw-scale-x: .9;--tw-scale-y: .9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem);border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 1rem);--tw-bg-opacity: 
1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding:1.5rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;box-shadow:#00000040 0 25px 50px -12px;overflow-y:auto;overscroll-behavior:contain}.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open]{pointer-events:auto;visibility:visible;opacity:1}.modal-action{display:flex;margin-top:1.5rem;justify-content:flex-end}:root:has(:is(.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open])){overflow:hidden;scrollbar-gutter:stable}.radio{flex-shrink:0;--chkbg: var(--bc);height:1.5rem;width:1.5rem;cursor:pointer;-webkit-appearance:none;-moz-appearance:none;appearance:none;border-radius:9999px;border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity: .2}.range{height:1.5rem;width:100%;cursor:pointer;-moz-appearance:none;appearance:none;-webkit-appearance:none;--range-shdw: var(--fallback-bc,oklch(var(--bc)/1));overflow:hidden;border-radius:var(--rounded-box, 1rem);background-color:transparent}.range:focus{outline:none}.textarea{min-height:3rem;flex-shrink:1;padding:.5rem 1rem;font-size:.875rem;line-height:1.25rem;line-height:2;border-radius:var(--rounded-btn, .5rem);border-width:1px;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.btm-nav>*.disabled,.btm-nav>*[disabled]{pointer-events:none;--tw-border-opacity: 
0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btm-nav>* .label{font-size:1rem;line-height:1.5rem}@media (prefers-reduced-motion: no-preference){.btn{animation:button-pop var(--animation-btn, .25s) ease-out}}.btn:active:hover,.btn:active:focus{animation:button-pop 0s ease-out;transform:scale(var(--btn-focus-scale, .97))}@supports not (color: oklch(0% 0 0)){.btn{background-color:var(--btn-color, var(--fallback-b2));border-color:var(--btn-color, var(--fallback-b2))}.btn-primary{--btn-color: var(--fallback-p)}.btn-neutral{--btn-color: var(--fallback-n)}}@supports (color: color-mix(in oklab,black,black)){.btn-active{background-color:color-mix(in oklab,oklch(var(--btn-color, var(--b3)) / var(--tw-bg-opacity, 1)) 90%,black);border-color:color-mix(in oklab,oklch(var(--btn-color, var(--b3)) / var(--tw-border-opacity, 1)) 90%,black)}.btn-outline.btn-primary.btn-active{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,black)}.btn-outline.btn-secondary.btn-active{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,black)}.btn-outline.btn-accent.btn-active{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,black)}.btn-outline.btn-success.btn-active{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,black)}.btn-outline.btn-info.btn-active{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 
90%,black)}.btn-outline.btn-warning.btn-active{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,black)}.btn-outline.btn-error.btn-active{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,black);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,black)}}.btn:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px}.btn-primary{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}@supports (color: oklch(0% 0 0)){.btn-primary{--btn-color: var(--p)}.btn-neutral{--btn-color: var(--n)}}.btn-neutral{--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));outline-color:var(--fallback-n,oklch(var(--n)/1))}.btn.glass{--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:currentColor}.btn.glass.btn-active{--glass-opacity: 25%;--glass-border-opacity: 15%}.btn-ghost{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow);outline-color:currentColor}.btn-ghost.btn-active{border-color:transparent;background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.btn-link.btn-active{border-color:transparent;background-color:transparent;text-decoration-line:underline}.btn-outline.btn-active{--tw-border-opacity: 1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}.btn-outline.btn-primary{--tw-text-opacity: 
1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}.btn-outline.btn-primary.btn-active{--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.btn-outline.btn-secondary.btn-active{--tw-text-opacity: 1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.btn-outline.btn-accent.btn-active{--tw-text-opacity: 1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.btn-outline.btn-success.btn-active{--tw-text-opacity: 1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.btn-outline.btn-info.btn-active{--tw-text-opacity: 1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.btn-outline.btn-warning.btn-active{--tw-text-opacity: 1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.btn-outline.btn-error.btn-active{--tw-text-opacity: 1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.btn.btn-disabled,.btn[disabled],.btn:disabled{--tw-border-opacity: 0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity: .2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity: .2}.btn:is(input[type=checkbox]:checked),.btn:is(input[type=radio]:checked){--tw-border-opacity: 1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.btn:is(input[type=checkbox]:checked):focus-visible,.btn:is(input[type=radio]:checked):focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}@keyframes button-pop{0%{transform:scale(var(--btn-focus-scale, 
.98))}40%{transform:scale(1.02)}to{transform:scale(1)}}.checkbox:focus{box-shadow:none}.checkbox:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.checkbox:disabled{border-width:0px;cursor:not-allowed;border-color:transparent;--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.2}.checkbox:checked,.checkbox[aria-checked=true]{background-repeat:no-repeat;animation:checkmark var(--animation-input, .2s) ease-out;background-color:var(--chkbg);background-image:linear-gradient(-45deg,transparent 65%,var(--chkbg) 65.99%),linear-gradient(45deg,transparent 75%,var(--chkbg) 75.99%),linear-gradient(-45deg,var(--chkbg) 40%,transparent 40.99%),linear-gradient(45deg,var(--chkbg) 30%,var(--chkfg) 30.99%,var(--chkfg) 40%,transparent 40.99%),linear-gradient(-45deg,var(--chkfg) 50%,var(--chkbg) 50.99%)}.checkbox:indeterminate{--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-repeat:no-repeat;animation:checkmark var(--animation-input, .2s) ease-out;background-image:linear-gradient(90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(-90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(0deg,var(--chkbg) 43%,var(--chkfg) 43%,var(--chkfg) 57%,var(--chkbg) 57%)}@keyframes checkmark{0%{background-position-y:5px}50%{background-position-y:-2px}to{background-position-y:0}}details.collapse{width:100%}details.collapse summary{position:relative;display:block;outline:2px solid transparent;outline-offset:2px}details.collapse 
summary::-webkit-details-marker{display:none}.collapse:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse:has(.collapse-title:focus-visible),.collapse:has(>input[type=checkbox]:focus-visible),.collapse:has(>input[type=radio]:focus-visible){outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse-arrow>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;--tw-translate-y: -100%;--tw-rotate: 45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.15s;transition-duration:.2s;top:1.9rem;inset-inline-end:1.4rem;content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.collapse-plus>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.3s;top:.9rem;inset-inline-end:1.4rem;content:"+";pointer-events:none}.collapse:not(.collapse-open):not(.collapse-close)>input[type=checkbox],.collapse:not(.collapse-open):not(.collapse-close)>input[type=radio]:not(:checked),.collapse:not(.collapse-open):not(.collapse-close)>.collapse-title{cursor:pointer}.collapse:focus:not(.collapse-open):not(.collapse-close):not(.collapse[open])>.collapse-title{cursor:unset}.collapse-title{position:relative}:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){z-index:1}.collapse-title,:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){width:100%;padding:1rem;padding-inline-end:3rem;min-height:3.75rem;transition:background-color 
.2s ease-out}.collapse[open]>:where(.collapse-content),.collapse-open>:where(.collapse-content),.collapse:focus:not(.collapse-close)>:where(.collapse-content),.collapse:not(.collapse-close)>:where(input[type=checkbox]:checked~.collapse-content),.collapse:not(.collapse-close)>:where(input[type=radio]:checked~.collapse-content){padding-bottom:1rem;transition:padding .2s ease-out,background-color .2s ease-out}.collapse[open].collapse-arrow>.collapse-title:after,.collapse-open.collapse-arrow>.collapse-title:after,.collapse-arrow:focus:not(.collapse-close)>.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after{--tw-translate-y: -50%;--tw-rotate: 225deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.collapse[open].collapse-plus>.collapse-title:after,.collapse-open.collapse-plus>.collapse-title:after,.collapse-plus:focus:not(.collapse-close)>.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after{content:"−"}.drawer-toggle:checked~.drawer-side>.drawer-overlay{background-color:#0006}.drawer-toggle:focus-visible~.drawer-content label.drawer-button{outline-style:solid;outline-width:2px;outline-offset:2px}.dropdown.dropdown-open .dropdown-content,.dropdown:focus .dropdown-content,.dropdown:focus-within .dropdown-content{--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.input input{--tw-bg-opacity: 
1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));background-color:transparent}.input input:focus{outline:2px solid transparent;outline-offset:2px}.input[list]::-webkit-calendar-picker-indicator{line-height:1em}.input-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:focus,.input:focus-within{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:has(>input[disabled]),.input-disabled,.input:disabled,.input[disabled]{cursor:not-allowed;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.input:has(>input[disabled])::-moz-placeholder,.input-disabled::-moz-placeholder,.input:disabled::-moz-placeholder,.input[disabled]::-moz-placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.input:has(>input[disabled])::placeholder,.input-disabled::placeholder,.input:disabled::placeholder,.input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.input:has(>input[disabled])>input[disabled]{cursor:not-allowed}.input::-webkit-date-and-time-value{text-align:inherit}.join>:where(*:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1)}.join-item:focus{isolation:isolate}.loading{pointer-events:none;display:inline-block;aspect-ratio:1 / 1;width:1.5rem;background-color:currentColor;-webkit-mask-size:100%;mask-size:100%;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-position:center;mask-position:center;-webkit-mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' 
xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E");mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-dots{-webkit-mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' 
r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E");mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E")}.loading-md{width:1.5rem}:where(.menu li:empty){--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;margin:.5rem 1rem;height:1px}.menu :where(li ul):before{position:absolute;bottom:.75rem;inset-inline-start:0px;top:.75rem;width:1px;--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;content:""}.menu :where(li:not(.menu-title)>*:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){border-radius:var(--rounded-btn, .5rem);padding:.5rem 
1rem;text-align:start;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;text-wrap:balance}:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(summary,.active,.btn).focus,:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):not(summary,.active,.btn):focus,:where(.menu li:not(.menu-title,.disabled)>*:not(ul,details,.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn).focus,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn):focus,:where(.menu li:not(.menu-title,.disabled)>details>summary:not(.menu-title)):is(summary):not(.active,.btn):focus-visible{cursor:pointer;background-color:var(--fallback-bc,oklch(var(--bc)/.1));--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));outline:2px solid transparent;outline-offset:2px}.menu li>*:not(ul,.menu-title,details,.btn):active,.menu li>*:not(ul,.menu-title,details,.btn).active,.menu li>details>summary:active{--tw-bg-opacity: 1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity: 1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.menu :where(li>details>summary)::-webkit-details-marker{display:none}.menu :where(li>details>summary):after,.menu 
:where(li>.menu-dropdown-toggle):after{justify-self:end;display:block;margin-top:-.5rem;height:.5rem;width:.5rem;transform:rotate(45deg);transition-property:transform,margin-top;transition-duration:.3s;transition-timing-function:cubic-bezier(.4,0,.2,1);content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.menu :where(li>details[open]>summary):after,.menu :where(li>.menu-dropdown-toggle.menu-dropdown-show):after{transform:rotate(225deg);margin-top:0}.mockup-browser .mockup-browser-toolbar .input{position:relative;margin-left:auto;margin-right:auto;display:block;height:1.75rem;width:24rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:2rem;direction:ltr}.mockup-browser .mockup-browser-toolbar .input:before{content:"";position:absolute;left:.5rem;top:50%;aspect-ratio:1 / 1;height:.75rem;--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:2px;border-color:currentColor;opacity:.6}.mockup-browser .mockup-browser-toolbar .input:after{content:"";position:absolute;left:1.25rem;top:50%;height:.5rem;--tw-translate-y: 25%;--tw-rotate: -45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:1px;border-color:currentColor;opacity:.6}.modal:not(dialog:not(.modal-open)),.modal::backdrop{background-color:#0006;animation:modal-pop .2s ease-out}.modal-open .modal-box,.modal-toggle:checked+.modal .modal-box,.modal:target .modal-box,.modal[open] .modal-box{--tw-translate-y: 0px;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) 
rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.modal-action>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}@keyframes modal-pop{0%{opacity:0}}@keyframes progress-loading{50%{background-position-x:-115%}}.radio:focus{box-shadow:none}.radio:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.radio:checked,.radio[aria-checked=true]{--tw-bg-opacity: 1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-image:none;animation:radiomark var(--animation-input, .2s) ease-out;box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}.radio:disabled{cursor:not-allowed;opacity:.2}@keyframes radiomark{0%{box-shadow:0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset}50%{box-shadow:0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset}to{box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}}.range:focus-visible::-webkit-slider-thumb{--focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset}.range:focus-visible::-moz-range-thumb{--focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset}.range::-webkit-slider-runnable-track{height:.5rem;width:100%;border-radius:var(--rounded-box, 1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-moz-range-track{height:.5rem;width:100%;border-radius:var(--rounded-box, 
1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-webkit-slider-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box, 1rem);border-style:none;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));-moz-appearance:none;appearance:none;-webkit-appearance:none;top:50%;color:var(--range-shdw);transform:translateY(-50%);--filler-size: 100rem;--filler-offset: .6rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow, 0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range::-moz-range-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box, 1rem);border-style:none;--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));top:50%;color:var(--range-shdw);--filler-size: 100rem;--filler-offset: .5rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow, 0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}@keyframes rating-pop{0%{transform:translateY(-.125em)}40%{transform:translateY(-.125em)}to{transform:translateY(0)}}@keyframes skeleton{0%{background-position:150%}to{background-position:-50%}}.textarea-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea-disabled,.textarea:disabled,.textarea[disabled]{cursor:not-allowed;--tw-border-opacity: 1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity: 
1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.textarea-disabled::-moz-placeholder,.textarea:disabled::-moz-placeholder,.textarea[disabled]::-moz-placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}.textarea-disabled::placeholder,.textarea:disabled::placeholder,.textarea[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity: .2}@keyframes toast-pop{0%{transform:scale(.9);opacity:0}to{transform:scale(1);opacity:1}}.glass,.glass.btn-active{border:none;-webkit-backdrop-filter:blur(var(--glass-blur, 40px));backdrop-filter:blur(var(--glass-blur, 40px));background-color:transparent;background-image:linear-gradient(135deg,rgb(255 255 255 / var(--glass-opacity, 30%)),#0000),linear-gradient(var(--glass-reflex-degree, 100deg),rgb(255 255 255 / var(--glass-reflex-opacity, 10%)) 25%,rgb(0 0 0 / 0%) 25%);box-shadow:0 0 0 1px rgb(255 255 255 / var(--glass-border-opacity, 10%)) inset,0 0 0 2px #0000000d;text-shadow:0 1px rgb(0 0 0 / var(--glass-text-shadow-opacity, 5%))}@media (hover: hover){.glass.btn-active{border:none;-webkit-backdrop-filter:blur(var(--glass-blur, 40px));backdrop-filter:blur(var(--glass-blur, 40px));background-color:transparent;background-image:linear-gradient(135deg,rgb(255 255 255 / var(--glass-opacity, 30%)),#0000),linear-gradient(var(--glass-reflex-degree, 100deg),rgb(255 255 255 / var(--glass-reflex-opacity, 10%)) 25%,rgb(0 0 0 / 0%) 25%);box-shadow:0 0 0 1px rgb(255 255 255 / var(--glass-border-opacity, 10%)) inset,0 0 0 2px #0000000d;text-shadow:0 1px rgb(0 0 0 / var(--glass-text-shadow-opacity, 
5%))}}.btn-sm{height:2rem;min-height:2rem;padding-left:.75rem;padding-right:.75rem;font-size:.875rem}.btn-block{width:100%}.btn-square:where(.btn-sm){height:2rem;width:2rem;padding:0}.btn-circle:where(.btn-sm){height:2rem;width:2rem;border-radius:9999px;padding:0}.drawer-open>.drawer-toggle{display:none}.drawer-open>.drawer-toggle~.drawer-side{pointer-events:auto;visibility:visible;position:sticky;display:block;width:auto;overscroll-behavior:auto}.drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}[dir=rtl] .drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.drawer-open>.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible}.drawer-open>.drawer-side{overflow-y:auto}html:has(.drawer-toggle:checked){overflow-y:hidden;scrollbar-gutter:stable}.join.join-vertical .join-item:first-child:not(:last-child),.join.join-vertical *:first-child:not(:last-child) .join-item{border-end-start-radius:0;border-end-end-radius:0;border-start-start-radius:inherit;border-start-end-radius:inherit}.join.join-vertical .join-item:last-child:not(:first-child),.join.join-vertical *:last-child:not(:first-child) .join-item{border-start-start-radius:0;border-start-end-radius:0;border-end-start-radius:inherit;border-end-end-radius:inherit}.join.join-horizontal .join-item:first-child:not(:last-child),.join.join-horizontal *:first-child:not(:last-child) .join-item{border-end-end-radius:0;border-start-end-radius:0;border-end-start-radius:inherit;border-start-start-radius:inherit}.join.join-horizontal .join-item:last-child:not(:first-child),.join.join-horizontal *:last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0;border-end-end-radius:inherit;border-start-end-radius:inherit}.drawer-open>.drawer-toggle~.drawer-side>.drawer-overlay{cursor:default;background-color:transparent}.join.join-vertical>:where(*:not(:first-child)):is(.btn){margin-top:calc(var(--border-btn) * 
-1)}.join.join-horizontal>:where(*:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1);margin-top:0}.modal-top :where(.modal-box){width:100%;max-width:none;--tw-translate-y: -2.5rem;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem);border-top-left-radius:0;border-top-right-radius:0}.modal-middle :where(.modal-box){width:91.666667%;max-width:32rem;--tw-translate-y: 0px;--tw-scale-x: .9;--tw-scale-y: .9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 1rem);border-bottom-right-radius:var(--rounded-box, 1rem);border-bottom-left-radius:var(--rounded-box, 1rem)}.modal-bottom :where(.modal-box){width:100%;max-width:none;--tw-translate-y: 2.5rem;--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-top-left-radius:var(--rounded-box, 1rem);border-top-right-radius:var(--rounded-box, 
1rem);border-bottom-right-radius:0;border-bottom-left-radius:0}.collapse{visibility:collapse}.relative{position:relative}.sticky{position:sticky}.top-4{top:1rem}.z-10{z-index:10}.z-50{z-index:50}.z-\[1\]{z-index:1}.m-1{margin:.25rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.my-4{margin-top:1rem;margin-bottom:1rem}.mb-2{margin-bottom:.5rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.mr-2{margin-right:.5rem}.mt-2{margin-top:.5rem}.mt-4{margin-top:1rem}.mt-6{margin-top:1.5rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.inline{display:inline}.flex{display:flex}.h-0{height:0px}.h-24{height:6rem}.h-80{height:20rem}.h-\[calc\(90vh-12rem\)\]{height:calc(90vh - 12rem)}.h-screen{height:100vh}.min-h-full{min-height:100%}.w-52{width:13rem}.w-64{width:16rem}.w-\[calc\(90vw-8em\)\]{width:calc(90vw - 8em)}.w-full{width:100%}.w-screen{width:100vw}.max-w-\[calc\(100vw-2em\)\]{max-width:calc(100vw - 2em)}.grow{flex-grow:1}.flex-row{flex-direction:row}.flex-col{flex-direction:column}.items-center{align-items:center}.justify-start{justify-content:flex-start}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-2{gap:.5rem}.overflow-visible{overflow:visible}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.rounded-box{border-radius:var(--rounded-box, 1rem)}.bg-base-100{--tw-bg-opacity: 1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity, 1)))}.bg-base-200{--tw-bg-opacity: 1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity, 1)))}.bg-base-300{--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity, 
1)))}.p-2{padding:.5rem}.px-4{padding-left:1rem;padding-right:1rem}.py-4{padding-top:1rem;padding-bottom:1rem}.text-center{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-normal{font-weight:400}.text-base-content{--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity, 1)))}.text-error{--tw-text-opacity: 1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity, 1)))}.underline{text-decoration-line:underline}.opacity-0{opacity:0}.opacity-40{opacity:.4}.shadow{--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / .1), 0 1px 2px -1px rgb(0 0 0 / .1);--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-2xl{--tw-shadow: 0 25px 50px -12px rgb(0 0 0 / .25);--tw-shadow-colored: 0 25px 50px -12px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.markdown h1,.markdown h2,.markdown h3,.markdown h4,.markdown h5,.markdown h6,.markdown ul,.markdown ol,.markdown li{all:revert}.markdown pre{white-space:pre-wrap}.markdown pre{border-radius:.5rem}.markdown pre{padding:.5rem}.markdown pre{border:1px solid currentColor}@media (min-width: 768px){.show-on-hover{opacity:0}.group:hover .show-on-hover{opacity:1}}.btn-mini{cursor:pointer}.btn-mini:hover{--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 
#0000),var(--tw-shadow)}.chat-screen{max-width:900px}.chat-bubble-base-300{--tw-bg-opacity: 1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity, 1)));--tw-text-opacity: 1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity, 1)))}@media (min-width: 1024px){.lg\:drawer-open>.drawer-toggle{display:none}.lg\:drawer-open>.drawer-toggle~.drawer-side{pointer-events:auto;visibility:visible;position:sticky;display:block;width:auto;overscroll-behavior:auto}.lg\:drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}[dir=rtl] .lg\:drawer-open>.drawer-toggle~.drawer-side>*:not(.drawer-overlay){transform:translate(0)}.lg\:drawer-open>.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible}.lg\:drawer-open>.drawer-side{overflow-y:auto}html:has(.lg\:drawer-open.lg\:drawer-open){overflow-y:auto;scrollbar-gutter:auto}.lg\:drawer-open>.drawer-toggle~.drawer-side>.drawer-overlay{cursor:default;background-color:transparent}.lg\:hidden{display:none}.lg\:h-screen{height:100vh}.lg\:w-96{width:24rem}.lg\:max-w-64{max-width:16rem}}</style>
-</head>
-
-<body>
-  <div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
-    <div class="flex flex-row drawer lg:drawer-open">
-      <input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
-
-      <!-- sidebar -->
-      <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
-        <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
-        <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
-          <div class="flex flex-row items-center justify-between mb-4 mt-4">
-            <h2 class="font-bold ml-4">Conversations</h2>
-
-            <!-- close sidebar button -->
-            <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
-              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
-                <path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
-              </svg>
-            </label>
-          </div>
-
-          <!-- list of conversations -->
-          <div :class="{
-            'btn btn-ghost justify-start': true,
-            'btn-active': messages.length === 0,
-          }" @click="newConversation">
-            + New conversation
-          </div>
-          <div v-for="conv in conversations" :class="{
-            'btn btn-ghost justify-start font-normal': true,
-            'btn-active': conv.id === viewingConvId,
-          }" @click="setViewingConv(conv.id)">
-            <span class="truncate">{{ conv.messages[0].content }}</span>
-          </div>
-          <div class="text-center text-xs opacity-40 mt-auto mx-4">
-            Conversations are saved to browser's localStorage
-          </div>
-        </div>
-      </div>
-
-      <!-- main view -->
-      <div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
-        <!-- header -->
-        <div class="flex flex-row items-center mt-6 mb-6">
-          <!-- open sidebar button -->
-          <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
-            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
-              <path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
-            </svg>
-          </label>
-
-          <div class="grow text-2xl font-bold ml-2">llama.cpp</div>
-
-          <!-- action buttons (top right) -->
-          <div class="flex items-center">
-            <div v-if="messages.length > 0" class="dropdown dropdown-end">
-              <!-- "more" button -->
-              <button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
-                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
-                  <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
-                </svg>
-              </button>
-              <!-- "more" dropdown menu -->
-              <ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
-                <li @click="downloadConv(viewingConvId)"><a>Download</a></li>
-                <li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
-              </ul>
-            </div>
-            <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
-              <!-- settings button -->
-              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
-                <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
-                <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
-              </svg>
-            </button>
-
-            <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
-            <div class="dropdown dropdown-end dropdown-bottom">
-              <div tabindex="0" role="button" class="btn m-1">
-                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
-                  <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
-                </svg>
-              </div>
-              <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
-                <li>
-                  <button
-                    class="btn btn-sm btn-block btn-ghost justify-start"
-                    :class="{ 'btn-active': selectedTheme === 'auto' }"
-                    @click="setSelectedTheme('auto')">
-                    auto
-                  </button>
-                </li>
-                <li v-for="theme in themes">
-                  <input
-                    type="radio"
-                    name="theme-dropdown"
-                    class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
-                    :aria-label="theme"
-                    :value="theme"
-                    :checked="selectedTheme === theme"
-                    @click="setSelectedTheme(theme)" />
-                </li>
-              </ul>
-            </div>
-          </div>
-        </div>
-
-        <!-- chat messages -->
-        <div id="messages-list" class="flex flex-col grow overflow-y-auto">
-          <div class="mt-auto flex justify-center">
-            <!-- placeholder to shift the message to the bottom -->
-            {{ messages.length === 0 ? 'Send a message to start' : '' }}
-          </div>
-          <div v-for="msg in messages" class="group">
-            <div :class="{
-              'chat': true,
-              'chat-start': msg.role !== 'user',
-              'chat-end': msg.role === 'user',
-            }">
-              <div :class="{
-                'chat-bubble markdown': true,
-                'chat-bubble-base-300': msg.role !== 'user',
-              }">
-                <!-- textarea for editing message -->
-                <template v-if="editingMsg && editingMsg.id === msg.id">
-                  <textarea
-                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
-                    v-model="msg.content"></textarea>
-                  <br/>
-                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
-                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
-                </template>
-                <!-- render message as markdown -->
-                <vue-markdown v-else :source="msg.content" />
-              </div>
-            </div>
-
-            <!-- actions for each message -->
-            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
-              <!-- user message -->
-              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
-                ✍️ Edit
-              </button>
-              <!-- assistant message -->
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-                🔄 Regenerate
-              </button>
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
-                📋 Copy
-              </button>
-            </div>
-          </div>
-
-          <!-- pending (ongoing) assistant message -->
-          <div id="pending-msg" class="chat chat-start">
-            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
-              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
-              <vue-markdown v-else :source="pendingMsg.content" />
-            </div>
-          </div>
-        </div>
-
-        <!-- chat input -->
-        <div class="flex flex-row items-center mt-8 mb-6">
-          <textarea
-            class="textarea textarea-bordered w-full"
-            placeholder="Type a message (Shift+Enter to add a new line)"
-            v-model="inputMsg"
-            @keydown.enter.exact.prevent="sendMessage"
-            @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
-            :disabled="isGenerating"
-            id="msg-input"
-          ></textarea>
-          <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
-          <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
-        </div>
-      </div>
-
-    </div>
-
-
-    <!-- modal for editing config -->
-    <dialog class="modal" :class="{'modal-open': showConfigDialog}">
-      <div class="modal-box">
-        <h3 class="text-lg font-bold mb-6">Settings</h3>
-        <div class="h-[calc(90vh-12rem)] overflow-y-auto">
-          <p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
-          <settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
-          <label class="form-control mb-2">
-            <div class="label">System Message</div>
-            <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
-          </label>
-          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-          </template>
-          <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
-          <!-- Section: Other sampler settings -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Other sampler settings</summary>
-            <div class="collapse-content">
-              <!-- Samplers queue -->
-              <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
-              <!-- Samplers -->
-              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-              </template>
-            </div>
-          </details>
-          <!-- Section: Penalties settings -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Penalties settings</summary>
-            <div class="collapse-content">
-              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-              </template>
-            </div>
-          </details>
-          <!-- Section: Advanced config -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Advanced config</summary>
-            <div class="collapse-content">
-              <label class="form-control mb-2">
-                <!-- Custom parameters input -->
-                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
-                <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
-              </label>
-            </div>
-          </details>
-        </div>
-
-        <!-- action buttons -->
-        <div class="modal-action">
-          <button class="btn" @click="resetConfigDialog">Reset to default</button>
-          <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
-          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
-        </div>
-      </div>
-    </dialog>
-
-  </div>
-
-  <!-- Template to be used by settings modal -->
-  <template id="settings-modal-short-input">
-    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
-      <!-- Show help message on hovering on the input label -->
-      <div class="dropdown dropdown-hover">
-        <div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
-        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
-          {{ configInfo[configKey] || '(no help message available)' }}
-        </div>
-      </div>
-      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
-      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
-    </label>
-  </template>
-
-</body>
-
-</html>
diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz
new file mode 100644
index 00000000000..36f9c9fe9a6
Binary files /dev/null and b/examples/server/public/index.html.gz differ
diff --git a/examples/server/public_legacy/index-new.html b/examples/server/public_legacy/index-new.html
index 8bfa380e573..cbfbbdf2806 100644
--- a/examples/server/public_legacy/index-new.html
+++ b/examples/server/public_legacy/index-new.html
@@ -39,7 +39,6 @@
       temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
       repeat_last_n: 0, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.0, // 1.0 = disabled
-      penalize_nl: false, // true only useful for infinite completion
       dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
       dry_base: 1.75,     // 0.0 = disabled
       dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html
index a95f5c6df87..75f39330a78 100644
--- a/examples/server/public_legacy/index.html
+++ b/examples/server/public_legacy/index.html
@@ -303,7 +303,6 @@
       temperature: 0.7,
       repeat_last_n: 256, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.18, // 1.0 = disabled
-      penalize_nl: false,
       dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
       dry_base: 1.75,     // 0.0 = disabled
       dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
@@ -1006,7 +1005,6 @@
             ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
             ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
             ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
             ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
             ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
             ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8cb992470a3..30ff3b14957 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -15,7 +15,7 @@
 #define MIMETYPE_JSON "application/json; charset=utf-8"
 
 // auto generated files (update with ./deps.sh)
-#include "index.html.hpp"
+#include "index.html.gz.hpp"
 #include "loading.html.hpp"
 
 #include <atomic>
@@ -79,8 +79,9 @@ enum error_type {
 };
 
 struct slot_params {
-    bool stream       = true;
-    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+    bool stream        = true;
+    bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens = false;
 
     int32_t n_keep    =  0; // number of tokens to keep from initial prompt
     int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -91,7 +92,9 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
+    std::vector<std::string> response_fields;
     bool timings_per_token = false;
+    bool post_sampling_probs = false;
     bool ignore_eos = false;
 
     struct common_params_sampling sampling;
@@ -135,7 +138,6 @@ struct slot_params {
             {"mirostat",                  sampling.mirostat},
             {"mirostat_tau",              sampling.mirostat_tau},
             {"mirostat_eta",              sampling.mirostat_eta},
-            {"penalize_nl",               sampling.penalize_nl},
             {"stop",                      antiprompt},
             {"max_tokens",                n_predict}, // User configured n_predict
             {"n_keep",                    n_keep},
@@ -151,6 +153,7 @@ struct slot_params {
             {"speculative.n_min",         speculative.n_min},
             {"speculative.p_min",         speculative.p_min},
             {"timings_per_token",         timings_per_token},
+            {"post_sampling_probs",       post_sampling_probs},
         };
     }
 };
@@ -184,6 +187,7 @@ struct server_task {
 
     static slot_params params_from_json_cmpl(
             const llama_model * model,
+            const llama_context * ctx,
             const common_params & params_base,
             const json & data) {
         slot_params params;
@@ -199,12 +203,14 @@ struct server_task {
 
         params.stream           = json_value(data, "stream",             false);
         params.cache_prompt     = json_value(data, "cache_prompt",       true);
+        params.return_tokens    = json_value(data, "return_tokens",      false);
         params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
         params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
         params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
         params.n_discard        = json_value(data, "n_discard",          defaults.n_discard);
       //params.t_max_prompt_ms  = json_value(data, "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
         params.t_max_predict_ms = json_value(data, "t_max_predict_ms",   defaults.t_max_predict_ms);
+        params.response_fields  = json_value(data, "response_fields",   std::vector<std::string>());
 
         params.sampling.top_k              = json_value(data, "top_k",              defaults.sampling.top_k);
         params.sampling.top_p              = json_value(data, "top_p",              defaults.sampling.top_p);
@@ -226,10 +232,10 @@ struct server_task {
         params.sampling.mirostat           = json_value(data, "mirostat",           defaults.sampling.mirostat);
         params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",       defaults.sampling.mirostat_tau);
         params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",       defaults.sampling.mirostat_eta);
-        params.sampling.penalize_nl        = json_value(data, "penalize_nl",        defaults.sampling.penalize_nl);
         params.sampling.seed               = json_value(data, "seed",               defaults.sampling.seed);
         params.sampling.n_probs            = json_value(data, "n_probs",            defaults.sampling.n_probs);
         params.sampling.min_keep           = json_value(data, "min_keep",           defaults.sampling.min_keep);
+        params.post_sampling_probs         = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
 
         params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
         params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
@@ -239,8 +245,27 @@ struct server_task {
         params.speculative.n_min = std::max(params.speculative.n_min, 2);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
+        // TODO: add more sanity checks for the input parameters
+
+        if (params.sampling.penalty_last_n < -1) {
+            throw std::runtime_error("Error: repeat_last_n must be >= -1");
+        }
+
+        if (params.sampling.dry_penalty_last_n < -1) {
+            throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
+        }
+
+        if (params.sampling.penalty_last_n == -1) {
+            // note: should be the slot's context and not the full context, but it's ok
+            params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        }
+
+        if (params.sampling.dry_penalty_last_n == -1) {
+            params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        }
+
         if (params.sampling.dry_base < 1.0f) {
-           params.sampling.dry_base = defaults.sampling.dry_base;
+            params.sampling.dry_base = defaults.sampling.dry_base;
         }
 
         // sequence breakers for DRY
@@ -416,41 +441,75 @@ inline std::string stop_type_to_str(stop_type type) {
 
 struct completion_token_output {
     llama_token tok;
+    float prob;
     std::string text_to_send;
-    struct token_prob {
+    struct prob_info {
         llama_token tok;
-        std::string tok_str;
+        std::string txt;
         float prob;
     };
-    std::vector<token_prob> probs;
+    std::vector<prob_info> probs;
 
-    json to_json() const {
+    json to_json(bool post_sampling_probs) const {
         json probs_for_token = json::array();
         for (const auto & p : probs) {
+            std::string txt(p.txt);
+            txt.resize(validate_utf8(txt));
             probs_for_token.push_back(json {
-                {"tok_str", p.tok_str},
-                {"prob",    p.prob},
+                {"id",      p.tok},
+                {"token",   txt},
+                {"bytes",   str_to_bytes(p.txt)},
+                {
+                    post_sampling_probs ? "prob" : "logprob",
+                    post_sampling_probs ? p.prob : logarithm(p.prob)
+                },
             });
         }
         return probs_for_token;
     }
 
-    static json probs_vector_to_json(const std::vector<completion_token_output> & probs) {
+    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
         json out = json::array();
-        for (const auto & prob : probs) {
-            const std::string tok_str = prob.text_to_send;
+        for (const auto & p : probs) {
+            std::string txt(p.text_to_send);
+            txt.resize(validate_utf8(txt));
             out.push_back(json {
-                {"content", tok_str},
-                {"probs",   prob.to_json()},
+                {"id",           p.tok},
+                {"token",        txt},
+                {"bytes",        str_to_bytes(p.text_to_send)},
+                {
+                    post_sampling_probs ? "prob" : "logprob",
+                    post_sampling_probs ? p.prob : logarithm(p.prob)
+                },
+                {
+                    post_sampling_probs ? "top_probs" : "top_logprobs",
+                    p.to_json(post_sampling_probs)
+                },
             });
         }
         return out;
     }
+
+    static float logarithm(float x) {
+        // log(0) is -inf, and nlohmann::json converts -inf to null, so return the lowest finite float for x == 0 instead
+        return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
+    }
+
+    // Returns the bytes of `str` as a vector of unsigned char: one element per
+    // char, in order, each converted to its unsigned value.
+    static std::vector<unsigned char> str_to_bytes(const std::string & str) {
+        // the iterator-range constructor performs the same per-element
+        // char -> unsigned char conversion as an explicit push_back loop
+        return std::vector<unsigned char>(str.begin(), str.end());
+    }
 };
 
 struct server_task_result_cmpl_final : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string  content;
+    llama_tokens tokens;
+
     bool stream;
     result_timings timings;
     std::string prompt;
@@ -459,11 +518,13 @@ struct server_task_result_cmpl_final : server_task_result {
     int32_t n_decoded;
     int32_t n_prompt_tokens;
     int32_t n_tokens_cached;
-    int32_t has_new_line;
+    bool has_new_line;
     std::string stopping_word;
     stop_type stop = STOP_TYPE_NONE;
 
+    bool post_sampling_probs;
     std::vector<completion_token_output> probs_output;
+    std::vector<std::string>  response_fields;
 
     slot_params generation_params;
 
@@ -492,6 +553,7 @@ struct server_task_result_cmpl_final : server_task_result {
         json res = json {
             {"index",               index},
             {"content",             stream ? "" : content}, // in stream mode, content is already in last partial chunk
+            {"tokens",              stream ? llama_tokens {} : tokens},
             {"id_slot",             id_slot},
             {"stop",                true},
             {"model",               oaicompat_model},
@@ -506,10 +568,10 @@ struct server_task_result_cmpl_final : server_task_result {
             {"tokens_cached",       n_tokens_cached},
             {"timings",             timings.to_json()},
         };
-        if (!probs_output.empty()) {
-            res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output);
+        if (!stream && !probs_output.empty()) {
+            res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
         }
-        return res;
+        return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
     }
 
     json to_json_oaicompat_chat() {
@@ -518,22 +580,29 @@ struct server_task_result_cmpl_final : server_task_result {
             finish_reason = "stop";
         }
 
-        json choices = json::array({json{
+        json choice = json{
             {"finish_reason", finish_reason},
             {"index", 0},
-            {"message", json{
+            {"message", json {
                 {"content", content},
-                {"role", "assistant"}
+                {"role",    "assistant"}
             }
-        }}});
+        }};
+
+        if (!stream && probs_output.size() > 0) {
+            choice["logprobs"] = json{
+                {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+            };
+        }
 
         std::time_t t = std::time(0);
 
         json res = json {
-            {"choices", choices},
-            {"created", t},
-            {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens",     n_prompt_tokens},
@@ -560,16 +629,19 @@ struct server_task_result_cmpl_final : server_task_result {
             finish_reason = "stop";
         }
 
-        json choices = json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}});
+        json choice = json{
+            {"finish_reason", finish_reason},
+            {"index", 0},
+            {"delta", json::object()}
+        };
 
         json ret = json {
-            {"choices", choices},
-            {"created", t},
-            {"id",      oaicompat_cmpl_id},
-            {"model",   oaicompat_model},
-            {"object",  "chat.completion.chunk"},
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens",     n_prompt_tokens},
@@ -587,12 +659,15 @@ struct server_task_result_cmpl_final : server_task_result {
 
 struct server_task_result_cmpl_partial : server_task_result {
     int index = 0;
-    std::string content;
+
+    std::string  content;
+    llama_tokens tokens;
 
     int32_t n_decoded;
     int32_t n_prompt_tokens;
 
-    std::vector<completion_token_output> probs_output;
+    bool post_sampling_probs;
+    completion_token_output prob_output;
     result_timings timings;
 
     // OAI-compat fields
@@ -619,6 +694,7 @@ struct server_task_result_cmpl_partial : server_task_result {
         json res = json {
             {"index",            index},
             {"content",          content},
+            {"tokens",           tokens},
             {"stop",             false},
             {"id_slot",          id_slot},
             {"tokens_predicted", n_decoded},
@@ -628,8 +704,8 @@ struct server_task_result_cmpl_partial : server_task_result {
         if (timings.prompt_n > 0) {
             res.push_back({"timings", timings.to_json()});
         }
-        if (!probs_output.empty()) {
-            res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output);
+        if (!prob_output.probs.empty()) {
+            res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
         }
         return res;
     }
@@ -660,7 +736,7 @@ struct server_task_result_cmpl_partial : server_task_result {
                 json second_ret = json{
                             {"choices", json::array({json{{"finish_reason", nullptr},
                                                             {"index", 0},
-                                                            {"delta", json{
+                                                            {"delta", json {
                                                             {"content", content}}}
                                                             }})},
                             {"created", t},
@@ -675,18 +751,27 @@ struct server_task_result_cmpl_partial : server_task_result {
                 {"finish_reason", nullptr},
                 {"index", 0},
                 {"delta",
-                json{
+                json {
                     {"content", content},
                 }},
             }});
         }
 
+        GGML_ASSERT(choices.size() >= 1);
+
+        if (prob_output.probs.size() > 0) {
+            choices[0]["logprobs"] = json{
+                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+            };
+        }
+
         json ret = json {
-            {"choices", choices},
-            {"created", t},
-            {"id",      oaicompat_cmpl_id},
-            {"model",   oaicompat_model},
-            {"object",  "chat.completion.chunk"}
+            {"choices",            choices},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"}
         };
 
         if (timings.prompt_n >= 0) {
@@ -699,32 +784,52 @@ struct server_task_result_cmpl_partial : server_task_result {
 
 struct server_task_result_embd : server_task_result {
     int index = 0;
-    std::vector<float> embedding;
+    std::vector<std::vector<float>> embedding;
+
+    int32_t n_tokens;
+
+    // OAI-compat fields
+    bool oaicompat = false;
 
     virtual int get_index() override {
         return index;
     }
 
     virtual json to_json() override {
+        return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat();
+    }
+
+    json to_json_non_oaicompat() {
         return json {
             {"index",     index},
             {"embedding", embedding},
         };
     }
+
+    json to_json_oaicompat() {
+        return json {
+            {"index",            index},
+            {"embedding",        embedding[0]},
+            {"tokens_evaluated", n_tokens},
+        };
+    }
 };
 
 struct server_task_result_rerank : server_task_result {
     int index = 0;
     float score = -1e6;
 
+    int32_t n_tokens;
+
     virtual int get_index() override {
         return index;
     }
 
     virtual json to_json() override {
         return json {
-            {"index", index},
-            {"score", score},
+            {"index",            index},
+            {"score",            score},
+            {"tokens_evaluated", n_tokens},
         };
     }
 };
@@ -931,8 +1036,11 @@ struct server_slot {
 
     size_t last_nl_pos = 0;
 
-    std::string generated_text;
+    std::string  generated_text;
+    llama_tokens generated_tokens;
+
     llama_tokens cache_tokens;
+
     std::vector<completion_token_output> generated_token_probs;
 
     bool has_next_token = true;
@@ -951,7 +1059,6 @@ struct server_slot {
 
     // stats
     size_t n_sent_text        = 0; // number of sent text character
-    size_t n_sent_token_probs = 0;
 
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
@@ -973,9 +1080,9 @@ struct server_slot {
         stopping_word      = "";
         n_past             = 0;
         n_sent_text        = 0;
-        n_sent_token_probs = 0;
         task_type          = SERVER_TASK_TYPE_COMPLETION;
 
+        generated_tokens.clear();
         generated_token_probs.clear();
     }
 
@@ -1079,9 +1186,9 @@ struct server_slot {
 
         SLT_INF(*this,
                 "\n"
-                "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
-                "\r       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
-                "\r      total time = %10.2f ms / %5d tokens\n",
+                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "      total time = %10.2f ms / %5d tokens\n",
                 t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
                 t_token_generation, n_decoded, t_gen, n_gen_second,
                 t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
@@ -1469,7 +1576,7 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);
 
         add_bos_token = llama_add_bos_token(model);
-        has_eos_token = !llama_add_eos_token(model);
+        has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
 
         if (!params_base.speculative.model.empty()) {
             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@@ -1713,35 +1820,19 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
+        const std::string token_str = result.text_to_send;
         slot.sampled = result.tok;
 
-        // search stop word and delete it
         slot.generated_text += token_str;
+        if (slot.params.return_tokens) {
+            slot.generated_tokens.push_back(result.tok);
+        }
         slot.has_next_token = true;
 
         // check if there is incomplete UTF-8 character at the end
-        bool incomplete = false;
-        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
-            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-            if ((c & 0xC0) == 0x80) {
-                // continuation byte: 10xxxxxx
-                continue;
-            }
-            if ((c & 0xE0) == 0xC0) {
-                // 2-byte character: 110xxxxx ...
-                incomplete = i < 2;
-            } else if ((c & 0xF0) == 0xE0) {
-                // 3-byte character: 1110xxxx ...
-                incomplete = i < 3;
-            } else if ((c & 0xF8) == 0xF0) {
-                // 4-byte character: 11110xxx ...
-                incomplete = i < 4;
-            }
-            // else 1-byte character or invalid byte
-            break;
-        }
+        bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size();
 
+        // search stop word and delete it
         if (!incomplete) {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
 
@@ -1869,6 +1960,55 @@ struct server_context {
         return slot.has_next_token; // continue
     }
 
+    // Fill result.prob (probability of the sampled token) and result.probs (the first n_probs
+    // candidates, in list order) for one generated token. With post_sampling the candidates come
+    // from common_sampler_get_candidates(slot.smpl), otherwise from get_token_probabilities(ctx, idx).
+    // result.prob is left unchanged when the sampled token is not found among the candidates.
+    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
+        const size_t n_probs = slot.params.sampling.n_probs;
+        const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+        if (post_sampling) {
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+            const size_t max_probs = cur_p->size;
+
+            // set probability for sampled token
+            for (size_t i = 0; i < max_probs; i++) {
+                if (cur_p->data[i].id == result.tok) {
+                    result.prob = cur_p->data[i].p;
+                    break;
+                }
+            }
+
+            // set probability for top n_probs tokens; reserve only what is pushed (max_probs may exceed n_probs)
+            const size_t n_top = std::min(max_probs, n_probs);
+            result.probs.reserve(n_top);
+            for (size_t i = 0; i < n_top; i++) {
+                const auto & cand = cur_p->data[i];
+                result.probs.push_back({ cand.id, common_detokenize(ctx, {cand.id}, special), cand.p });
+            }
+        } else {
+            // TODO: optimize this with min-p optimization
+            const std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
+            const size_t n_cur = std::min(n_vocab, cur.size()); // never index past the returned vector
+
+            // set probability for sampled token
+            for (size_t i = 0; i < n_cur; i++) {
+                if (cur[i].id == result.tok) {
+                    result.prob = cur[i].p;
+                    break;
+                }
+            }
+
+            // set probability for top n_probs tokens; n_probs comes from the request, so clamp before reserving
+            const size_t n_top = std::min(n_cur, n_probs);
+            result.probs.reserve(n_top);
+            for (size_t i = 0; i < n_top; i++) {
+                result.probs.push_back({ cur[i].id, common_detokenize(ctx, {cur[i].id}, special), cur[i].p });
+            }
+        }
+    }
+
     void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
         send_error(task.id, error, type);
     }
@@ -1894,9 +2034,11 @@ struct server_context {
         res->id      = slot.id_task;
         res->index   = slot.index;
         res->content = tkn.text_to_send;
+        res->tokens  = { tkn.tok };
 
-        res->n_decoded       = slot.n_decoded;
-        res->n_prompt_tokens = slot.n_prompt_tokens;
+        res->n_decoded           = slot.n_decoded;
+        res->n_prompt_tokens     = slot.n_prompt_tokens;
+        res->post_sampling_probs = slot.params.post_sampling_probs;
 
         res->verbose           = slot.params.verbose;
         res->oaicompat         = slot.params.oaicompat;
@@ -1906,17 +2048,7 @@ struct server_context {
 
         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
-            const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
-
-            const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
-            const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
-
-            std::vector<completion_token_output> probs_output;
-            if (probs_pos < probs_stop_pos) {
-                res->probs_output = std::vector<completion_token_output>(
-                        slot.generated_token_probs.begin() + probs_pos,
-                        slot.generated_token_probs.begin() + probs_stop_pos);
-            }
+            res->prob_output = tkn; // copy the token probs
         }
 
         // populate timings if this is final response or timings_per_token is enabled
@@ -1934,16 +2066,19 @@ struct server_context {
 
         res->index           = slot.index;
         res->content         = slot.generated_text;
+        res->tokens          = slot.generated_tokens;
         res->timings         = slot.get_timings();
         res->prompt          = common_detokenize(ctx, slot.prompt_tokens, true);
+        res->response_fields = slot.params.response_fields;
 
-        res->truncated       = slot.truncated;
-        res->n_decoded       = slot.n_decoded;
-        res->n_prompt_tokens = slot.n_prompt_tokens;
-        res->n_tokens_cached = slot.n_past;
-        res->has_new_line    = slot.has_new_line;
-        res->stopping_word   = slot.stopping_word;
-        res->stop            = slot.stop;
+        res->truncated           = slot.truncated;
+        res->n_decoded           = slot.n_decoded;
+        res->n_prompt_tokens     = slot.n_prompt_tokens;
+        res->n_tokens_cached     = slot.n_past;
+        res->has_new_line        = slot.has_new_line;
+        res->stopping_word       = slot.stopping_word;
+        res->stop                = slot.stop;
+        res->post_sampling_probs = slot.params.post_sampling_probs;
 
         res->verbose           = slot.params.verbose;
         res->stream            = slot.params.stream;
@@ -1975,8 +2110,10 @@ struct server_context {
 
     void send_embedding(const server_slot & slot, const llama_batch & batch) {
         auto res = std::make_unique<server_task_result_embd>();
-        res->id    = slot.id_task;
-        res->index = slot.index;
+        res->id        = slot.id_task;
+        res->index     = slot.index;
+        res->n_tokens  = slot.n_prompt_tokens;
+        res->oaicompat = slot.params.oaicompat;
 
         const int n_embd = llama_n_embd(model);
 
@@ -1995,12 +2132,18 @@ struct server_context {
             if (embd == NULL) {
                 SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
 
-                res->embedding = std::vector<float>(n_embd, 0.0f);
+                res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
                 continue;
             }
 
-            common_embd_normalize(embd, embd_res.data(), n_embd);
-            res->embedding = embd_res;
+            // normalize only when there is pooling
+            // TODO: configurable
+            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
+                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                res->embedding.push_back(embd_res);
+            } else {
+                res->embedding.push_back({ embd, embd + n_embd });
+            }
         }
 
         SLT_DBG(slot, "%s", "sending embeddings\n");
@@ -2012,6 +2155,7 @@ struct server_context {
         auto res = std::make_unique<server_task_result_rerank>();
         res->id    = slot.id_task;
         res->index = slot.index;
+        res->n_tokens = slot.n_prompt_tokens;
 
         for (int i = 0; i < batch.n_tokens; ++i) {
             if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
@@ -2613,7 +2757,10 @@ struct server_context {
 
                     // add prompt tokens for processing in the current batch
                     while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
+                        // without pooling, we want to output the embeddings for all the tokens in the batch
+                        const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2728,7 +2875,9 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                const int tok_idx = slot.i_batch - i;
+
+                llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
 
                 slot.i_batch = -1;
 
@@ -2747,17 +2896,12 @@ struct server_context {
                 slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
 
                 completion_token_output result;
-                result.tok = id;
-
-                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+                result.tok          = id;
+                result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
+                result.prob         = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
 
-                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
-                    auto tok_id = cur_p->data[i].id;
-                    result.probs.push_back({
-                        tok_id,
-                        tokens_to_output_formatted_string(ctx, tok_id),
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                if (slot.params.sampling.n_probs > 0) {
+                    populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx);
                 }
 
                 if (!process_token(result, slot)) {
@@ -2841,7 +2985,11 @@ struct server_context {
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;
 
-                    result.tok = ids[i];
+                    result.tok          = ids[i];
+                    result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
+                    result.prob         = 1.0f; // set later
+
+                    // TODO: set result.probs
 
                     if (!process_token(result, slot)) {
                         // release slot because of stop condition
@@ -3335,6 +3483,7 @@ int main(int argc, char ** argv) {
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
+            { "build_info",                  build_info },
         };
 
         res_ok(res, data);
@@ -3381,7 +3530,7 @@ int main(int argc, char ** argv) {
                 task.index = i;
 
                 task.prompt_tokens    = std::move(tokenized_prompts[i]);
-                task.params           = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data);
+                task.params           = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data);
                 task.id_selected_slot = json_value(data, "id_slot", -1);
 
                 // OAI-compat
@@ -3556,7 +3705,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
             {"data", {
                 {
-                    {"id",       params.model_alias},
+                    {"id",       params.model_alias.empty() ? params.model : params.model_alias},
                     {"object",   "model"},
                     {"created",  std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -3621,34 +3770,61 @@ int main(int argc, char ** argv) {
         res_ok(res, data);
     };
 
-    const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) {
         const json body = json::parse(req.body);
-        bool oaicompat = false;
 
-        // an input prompt can be a string or a list of tokens (integer)
+        if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+            res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        // for the shape of input/content, see tokenize_input_prompts()
         json prompt;
         if (body.count("input") != 0) {
-            oaicompat = true;
             prompt = body.at("input");
-        } else if (body.count("content") != 0) {
-            // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body.at("content")};
+        } else if (body.contains("content")) {
+            oaicompat = false;
+            prompt = body.at("content");
         } else {
             res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
             return;
         }
 
+        bool use_base64 = false;
+        if (body.count("encoding_format") != 0) {
+            const std::string& format = body.at("encoding_format");
+            if (format == "base64") {
+                use_base64 = true;
+            } else if (format != "float") {
+                res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        for (const auto & tokens : tokenized_prompts) {
+            // this check is necessary for models that do not add BOS token to the input
+            if (tokens.empty()) {
+                res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
         // create and queue the task
         json responses = json::array();
         bool error = false;
         {
             std::vector<server_task> tasks;
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true);
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
-                server_task task   = server_task(SERVER_TASK_TYPE_EMBEDDING);
+                server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
+
                 task.id            = ctx_server.queue_tasks.get_new_id();
                 task.index         = i;
                 task.prompt_tokens = std::move(tokenized_prompts[i]);
+
+                // OAI-compat
+                task.params.oaicompat = oaicompat;
+
                 tasks.push_back(task);
             }
 
@@ -3676,12 +3852,18 @@ int main(int argc, char ** argv) {
         }
 
         // write JSON response
-        json root = oaicompat
-            ? format_embeddings_response_oaicompat(body, responses)
-            : responses.size() == 1 ? responses[0] : json(responses);
+        json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses);
         res_ok(res, root);
     };
 
+    const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
+        handle_embeddings_impl(req, res, false); // third argument is presumably the oaicompat flag (TODO confirm); used for /embedding and /embeddings
+    };
+
+    const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
+        handle_embeddings_impl(req, res, true); // same implementation with the flag set; used for /v1/embeddings
+    };
+
     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
         if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
@@ -3828,8 +4010,13 @@ int main(int argc, char ** argv) {
             }
         } else {
             // using embedded static index.html
-            svr->Get("/", [](const httplib::Request &, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
+            svr->Get("/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
+                }
                 return false;
             });
         }
@@ -3850,7 +4037,7 @@ int main(int argc, char ** argv) {
     svr->Post("/infill",              handle_infill);
     svr->Post("/embedding",           handle_embeddings); // legacy
     svr->Post("/embeddings",          handle_embeddings);
-    svr->Post("/v1/embeddings",       handle_embeddings);
+    svr->Post("/v1/embeddings",       handle_embeddings_oai);
     svr->Post("/rerank",              handle_rerank);
     svr->Post("/reranking",           handle_rerank);
     svr->Post("/v1/rerank",           handle_rerank);
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index 6573cc17f7b..88549708113 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     })
     assert res.status_code == 200
     assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     last_cmpl_id = None
     for data in res:
         choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
@@ -92,7 +94,7 @@ def test_chat_completion_with_openai_library():
         seed=42,
         temperature=0.8,
     )
-    print(res)
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].message.content is not None
     assert match_regex("(Suddenly)+", res.choices[0].message.content)
@@ -163,3 +165,64 @@ def test_chat_completion_with_timings_per_token():
         assert "predicted_per_second" in data["timings"]
         assert "predicted_n" in data["timings"]
         assert data["timings"]["predicted_n"] <= 10
+
+
+def test_logprobs():  # non-streaming chat completion through the OpenAI client with logprobs enabled
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=5,
+        logprobs=True,
+        top_logprobs=10,
+    )
+    output_text = res.choices[0].message.content
+    aggregated_text = ''  # rebuilt from the per-token logprob entries
+    assert res.choices[0].logprobs is not None
+    assert res.choices[0].logprobs.content is not None
+    for token in res.choices[0].logprobs.content:
+        aggregated_text += token.token
+        assert token.logprob <= 0.0  # a log-probability can never be positive
+        assert token.bytes is not None
+        assert len(token.top_logprobs) > 0
+    assert aggregated_text == output_text  # the logprob tokens must concatenate to the returned message
+
+
+def test_logprobs_stream():  # streaming variant of test_logprobs: checks logprobs on every non-final chunk
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=5,
+        logprobs=True,
+        top_logprobs=10,
+        stream=True,
+    )
+    output_text = ''  # accumulated from the delta contents
+    aggregated_text = ''  # accumulated from the per-token logprob entries
+    for data in res:
+        choice = data.choices[0]
+        if choice.finish_reason is None:  # chunks that carry a finish_reason are not inspected
+            if choice.delta.content:
+                output_text += choice.delta.content
+            assert choice.logprobs is not None
+            assert choice.logprobs.content is not None
+            for token in choice.logprobs.content:
+                aggregated_text += token.token
+                assert token.logprob <= 0.0  # a log-probability can never be positive
+                assert token.bytes is not None
+                assert token.top_logprobs is not None
+                assert len(token.top_logprobs) > 0
+    assert aggregated_text == output_text  # both accumulations must yield the same text
diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
index 7f4f9cd038b..a6b21594466 100644
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@@ -10,22 +10,29 @@ def create_server():
     global server
     server = ServerPreset.tinyllama2()
 
-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
-    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
-    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
 ])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
     global server
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": n_predict,
         "prompt": prompt,
+        "return_tokens": return_tokens,
     })
     assert res.status_code == 200
     assert res.body["timings"]["prompt_n"] == n_prompt
     assert res.body["timings"]["predicted_n"] == n_predicted
     assert res.body["truncated"] == truncated
+    assert type(res.body["has_new_line"]) == bool
     assert match_regex(re_content, res.body["content"])
+    if return_tokens:
+        assert len(res.body["tokens"]) > 0
+        assert all(type(tok) == int for tok in res.body["tokens"])
+    else:
+        assert res.body["tokens"] == []
 
 
 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
@@ -48,12 +55,15 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
             assert data["timings"]["predicted_n"] == n_predicted
             assert data["truncated"] == truncated
             assert data["stop_type"] == "limit"
+            assert type(data["has_new_line"]) == bool
             assert "generation_settings" in data
             assert server.n_predict is not None
             assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
             assert data["generation_settings"]["seed"] == server.seed
             assert match_regex(re_content, content)
         else:
+            assert len(data["tokens"]) > 0
+            assert all(type(tok) == int for tok in data["tokens"])
             content += data["content"]
 
 
@@ -85,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
         res = server.make_request("POST", "/completion", data={
             "prompt": "I believe the meaning of life is",
             "seed": 42,
-            "temperature": 1.0,
+            "temperature": 0.0,
             "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
         })
         if last_res is not None:
@@ -110,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
             assert res.body["content"] != last_res.body["content"]
         last_res = res
 
-
+# TODO: figure out why this doesn't work with temperature = 1
+# @pytest.mark.parametrize("temperature", [0.0, 1.0])
 @pytest.mark.parametrize("n_batch", [16, 32])
-@pytest.mark.parametrize("temperature", [0.0, 1.0])
+@pytest.mark.parametrize("temperature", [0.0])
 def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
     global server
     server.n_batch = n_batch
@@ -247,6 +258,40 @@ def check_slots_status():
         # assert match_regex(re_content, res.body["content"])
 
 
+@pytest.mark.parametrize(
+    "prompt,n_predict,response_fields",
+    [
+        ("I believe the meaning of life is", 8, []),  # no filter requested
+        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
+    ],
+)
+def test_completion_response_fields(  # checks the "response_fields" request option of /completion
+    prompt: str, n_predict: int, response_fields: list[str]
+):
+    global server
+    server.start()
+    res = server.make_request(
+        "POST",
+        "/completion",
+        data={
+            "n_predict": n_predict,
+            "prompt": prompt,
+            "response_fields": response_fields,
+        },
+    )
+    assert res.status_code == 200
+    assert "content" in res.body
+    assert len(res.body["content"])
+    if len(response_fields):
+        assert res.body["generation_settings/n_predict"] == n_predict  # nested field is keyed by its full '/' path
+        assert res.body["prompt"] == "<s> " + prompt  # "<s> " is presumably the rendered BOS token (TODO confirm)
+        assert isinstance(res.body["content"], str)
+        assert len(res.body) == len(response_fields)  # nothing besides the requested fields is returned
+    else:
+        assert len(res.body)
+        assert "generation_settings" in res.body  # an empty list leaves the response unfiltered
+
+
 def test_n_probs():
     global server
     server.start()
@@ -260,9 +305,68 @@ def test_n_probs():
     assert "completion_probabilities" in res.body
     assert len(res.body["completion_probabilities"]) == 5
     for tok in res.body["completion_probabilities"]:
-        assert "probs" in tok
-        assert len(tok["probs"]) == 10
-        for prob in tok["probs"]:
-            assert "prob" in prob
-            assert "tok_str" in prob
-            assert 0.0 <= prob["prob"] <= 1.0
+        assert "id" in tok and tok["id"] > 0
+        assert "token" in tok and type(tok["token"]) == str
+        assert "logprob" in tok and tok["logprob"] <= 0.0
+        assert "bytes" in tok and type(tok["bytes"]) == list
+        assert len(tok["top_logprobs"]) == 10
+        for prob in tok["top_logprobs"]:
+            assert "id" in prob and prob["id"] > 0
+            assert "token" in prob and type(prob["token"]) == str
+            assert "logprob" in prob and prob["logprob"] <= 0.0
+            assert "bytes" in prob and type(prob["bytes"]) == list
+
+
+def test_n_probs_stream():  # streaming /completion with n_probs=10: validates the probabilities on each chunk
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "n_probs": 10,
+        "temperature": 0.0,
+        "n_predict": 5,
+        "stream": True,
+    })
+    for data in res:
+        if data["stop"] == False:  # only chunks that are not marked as the stop chunk are checked
+            assert "completion_probabilities" in data
+            assert len(data["completion_probabilities"]) == 1  # exactly one entry per streamed chunk
+            for tok in data["completion_probabilities"]:
+                assert "id" in tok and tok["id"] > 0
+                assert "token" in tok and type(tok["token"]) == str
+                assert "logprob" in tok and tok["logprob"] <= 0.0
+                assert "bytes" in tok and type(tok["bytes"]) == list
+                assert len(tok["top_logprobs"]) == 10  # matches the requested n_probs
+                for prob in tok["top_logprobs"]:
+                    assert "id" in prob and prob["id"] > 0
+                    assert "token" in prob and type(prob["token"]) == str
+                    assert "logprob" in prob and prob["logprob"] <= 0.0
+                    assert "bytes" in prob and type(prob["bytes"]) == list
+
+
+def test_n_probs_post_sampling():  # with post_sampling_probs the entries use "prob"/"top_probs" instead of logprobs
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "n_probs": 10,
+        "temperature": 0.0,
+        "n_predict": 5,
+        "post_sampling_probs": True,
+    })
+    assert res.status_code == 200
+    assert "completion_probabilities" in res.body
+    assert len(res.body["completion_probabilities"]) == 5  # one entry per predicted token (n_predict)
+    for tok in res.body["completion_probabilities"]:
+        assert "id" in tok and tok["id"] > 0
+        assert "token" in tok and type(tok["token"]) == str
+        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0  # the sampled token must have non-zero probability
+        assert "bytes" in tok and type(tok["bytes"]) == list
+        assert len(tok["top_probs"]) == 10  # matches the requested n_probs
+        for prob in tok["top_probs"]:
+            assert "id" in prob and prob["id"] > 0
+            assert "token" in prob and type(prob["token"]) == str
+            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
+            assert "bytes" in prob and type(prob["bytes"]) == list
+        # the test model usually outputs tokens with either 100% or 0% probability, so we check across all top_probs
+        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
index fc7c20064dd..8b0eb42b092 100644
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
@@ -1,3 +1,5 @@
+import base64
+import struct
 import pytest
 from openai import OpenAI
 from utils import *
@@ -14,8 +16,9 @@ def create_server():
 
 def test_embedding_single():
     global server
+    server.pooling = 'last'
     server.start()
-    res = server.make_request("POST", "/embeddings", data={
+    res = server.make_request("POST", "/v1/embeddings", data={
         "input": "I believe the meaning of life is",
     })
     assert res.status_code == 200
@@ -29,8 +32,9 @@ def test_embedding_single():
 
 def test_embedding_multiple():
     global server
+    server.pooling = 'last'
     server.start()
-    res = server.make_request("POST", "/embeddings", data={
+    res = server.make_request("POST", "/v1/embeddings", data={
         "input": [
             "I believe the meaning of life is",
             "Write a joke about AI from a very long prompt which will not be truncated",
@@ -45,10 +49,72 @@ def test_embedding_multiple():
         assert len(d['embedding']) > 1
 
 
+@pytest.mark.parametrize(
+    "input,is_multi_prompt",
+    [
+        # an empty string must not crash the server
+        ("", False),
+        # single prompt: plain string, list of token ids, or a mix of token ids and strings
+        ("string", False),
+        ([12, 34, 56], False),
+        ([12, 34, "string", 56, 78], False),
+        # multiple prompts: a list whose elements are themselves single prompts
+        (["string1", "string2"], True),
+        (["string1", [12, 34, 56]], True),
+        ([[12, 34, 56], [12, 34, 56]], True),
+        ([[12, 34, 56], [12, "string", 34, 56]], True),
+    ]
+)
+def test_embedding_mixed_input(input, is_multi_prompt: bool):  # /v1/embeddings must accept every supported "input" shape
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={"input": input})
+    assert res.status_code == 200
+    data = res.body['data']
+    if is_multi_prompt:
+        assert len(data) == len(input)  # one embedding per prompt
+        for d in data:
+            assert 'embedding' in d
+            assert len(d['embedding']) > 1
+    else:
+        assert 'embedding' in data[0]  # for a single prompt only the first entry is checked
+        assert len(data[0]['embedding']) > 1
+
+
+def test_embedding_pooling_none():  # with pooling 'none', /embeddings yields one unnormalized vector per token
+    global server
+    server.pooling = 'none'
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "hello hello hello",
+    })
+    assert res.status_code == 200
+    assert 'embedding' in res.body[0]
+    assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special
+
+    # make sure the per-token embedding vectors are not normalized (squared L2 norm differs from 1)
+    for token_vec in res.body[0]['embedding']:
+        assert abs(sum(v ** 2 for v in token_vec) - 1) > EPSILON
+
+
+def test_embedding_pooling_none_oai():  # OAI-compatible endpoint must reject requests when pooling is 'none'
+    global server
+    server.pooling = 'none'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "hello hello hello",
+    })
+
+    # /v1/embeddings does not support pooling type 'none', so an error response is expected
+    assert res.status_code == 400
+    assert "error" in res.body
+
+
 def test_embedding_openai_library_single():
     global server
+    server.pooling = 'last'
     server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
     res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
     assert len(res.data) == 1
     assert len(res.data[0].embedding) > 1
@@ -56,8 +122,9 @@ def test_embedding_openai_library_single():
 
 def test_embedding_openai_library_multiple():
     global server
+    server.pooling = 'last'
     server.start()
-    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
     res = client.embeddings.create(model="text-embedding-3-small", input=[
         "I believe the meaning of life is",
         "Write a joke about AI from a very long prompt which will not be truncated",
@@ -71,8 +138,9 @@ def test_embedding_openai_library_multiple():
 
 def test_embedding_error_prompt_too_long():
     global server
+    server.pooling = 'last'
     server.start()
-    res = server.make_request("POST", "/embeddings", data={
+    res = server.make_request("POST", "/v1/embeddings", data={
         "input": "This is a test " * 512,
     })
     assert res.status_code != 200
@@ -80,8 +148,9 @@ def test_embedding_error_prompt_too_long():
 
 
 def test_same_prompt_give_same_result():
+    server.pooling = 'last'
     server.start()
-    res = server.make_request("POST", "/embeddings", data={
+    res = server.make_request("POST", "/v1/embeddings", data={
         "input": [
             "I believe the meaning of life is",
             "I believe the meaning of life is",
@@ -97,3 +166,72 @@ def test_same_prompt_give_same_result():
         vi = res.body['data'][i]['embedding']
         for x, y in zip(v0, vi):
             assert abs(x - y) < EPSILON
+
+
+@pytest.mark.parametrize(
+    "content,n_tokens",
+    [
+        ("I believe the meaning of life is", 9),  # n_tokens is the expected usage.prompt_tokens value
+        ("This is a test", 6),
+    ]
+)
+def test_embedding_usage_single(content, n_tokens):  # checks the "usage" counters for a single input
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={"input": content})
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']  # both counters must agree
+    assert res.body['usage']['prompt_tokens'] == n_tokens
+
+
+def test_embedding_usage_multiple():  # usage counters must be summed over all inputs of the request
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == 2 * 9  # two identical prompts, 9 expected tokens each
+
+
+def test_embedding_openai_library_base64():  # "encoding_format": "base64" must encode the same vector as the default format
+    server.start()
+    test_input = "Test base64 embedding output"
+
+    # get the embedding in the default format (list of floats) as the reference
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": test_input
+    })
+    assert res.status_code == 200
+    vec0 = res.body["data"][0]["embedding"]
+
+    # get the embedding for the same input in base64 format
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": test_input,
+        "encoding_format": "base64"
+    })
+
+    assert res.status_code == 200
+    assert "data" in res.body
+    assert len(res.body["data"]) == 1
+
+    embedding_data = res.body["data"][0]
+    assert "embedding" in embedding_data
+    assert isinstance(embedding_data["embedding"], str)
+
+    # verify the embedding is valid base64
+    decoded = base64.b64decode(embedding_data["embedding"])
+    # verify the decoded bytes can be converted back to an array of floats
+    float_count = len(decoded) // 4  # 4 bytes per float
+    floats = struct.unpack(f'{float_count}f', decoded)  # no byte-order prefix: struct uses the native byte order
+    assert len(floats) > 0
+    assert all(isinstance(x, float) for x in floats)
+    assert len(floats) == len(vec0)
+
+    # make sure the decoded values match the reference vector element by element
+    for x, y in zip(floats, vec0):
+        assert abs(x - y) < EPSILON
diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py
index 189bc4c9623..7203d794357 100644
--- a/examples/server/tests/unit/test_rerank.py
+++ b/examples/server/tests/unit/test_rerank.py
@@ -53,3 +53,26 @@ def test_invalid_rerank_req(documents):
     })
     assert res.status_code == 400
     assert "error" in res.body
+
+
+@pytest.mark.parametrize(
+    "query,doc1,doc2,n_tokens",
+    [
+        ("Machine learning is", "A machine", "Learning is", 19),  # n_tokens is the expected usage.prompt_tokens value
+        ("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
+    ]
+)
+def test_rerank_usage(query, doc1, doc2, n_tokens):  # checks the "usage" counters returned by /rerank
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/rerank", data={
+        "query": query,
+        "documents": [
+            doc1,
+            doc2,
+        ]
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']  # both counters must agree
+    assert res.body['usage']['prompt_tokens'] == n_tokens
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index d988ccf5e30..277125e88b5 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -65,6 +65,7 @@ class ServerProcess:
     server_reranking: bool | None = False
     server_metrics: bool | None = False
     server_slots: bool | None = False
+    pooling: str | None = None
     draft: int | None = None
     api_key: str | None = None
     response_format: str | None = None
@@ -132,6 +133,8 @@ def start(self, timeout_seconds: int = 10) -> None:
             server_args.append("--metrics")
         if self.server_slots:
             server_args.append("--slots")
+        if self.pooling:
+            server_args.extend(["--pooling", self.pooling])
         if self.model_alias:
             server_args.extend(["--alias", self.model_alias])
         if self.n_ctx:
diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html
index 2797c37c964..3fb88fcc88d 100644
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
@@ -222,7 +222,6 @@
       temperature: 0.7,
       repeat_last_n: 256, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.18, // 1.0 = disabled
-      penalize_nl: false,
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
@@ -779,7 +778,6 @@
             ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
             ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
             ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
             ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
             ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
             ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html
index dbe23c40241..73f36d4b29f 100644
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
@@ -225,7 +225,6 @@
       temperature: 0.7,
       repeat_last_n: 256, // 0 = disable penalty, -1 = context size
       repeat_penalty: 1.18, // 1.0 = disabled
-      penalize_nl: false,
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
@@ -782,7 +781,6 @@
             ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
             ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
             ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
             ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
             ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
             ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 8f545aea52d..334f2f19207 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "common/base64.hpp"
 
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -22,7 +23,7 @@
 #include <vector>
 #include <memory>
 
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 
 using json = nlohmann::ordered_json;
 
@@ -56,6 +57,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     }
 }
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); // build identifier of the form "b<build number>-<commit>"
+
 //
 // tokenizer and input processing utils
 //
@@ -88,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
     return false;
 }
 
+// collect the values addressed by '/'-separated key paths (e.g. "key1/key2"); paths that do not resolve are omitted
+static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
+    json result = json::object(); // flat object: full path string -> resolved value
+
+    for (const std::string & path : paths) {
+        const json * current = &js; // walk by pointer instead of deep-copying `js` for every path
+        const auto keys = string_split<std::string>(path, /*separator*/ '/');
+        for (const std::string & k : keys) {
+            if (!current->is_object() || !current->contains(k)) {
+                current = nullptr; // path cannot be resolved, no need to look at the remaining keys
+                break;
+            }
+            current = &current->at(k);
+        }
+        if (current != nullptr) {
+            result[path] = *current;
+        }
+    }
+
+    return result;
+}
+
 /**
  * this handles 2 cases:
  * - only string, example: "string"
@@ -138,6 +163,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
  * and multiple prompts (multi-tasks):
  * - "prompt": ["string1", "string2"]
  * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
  * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
  */
 static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
@@ -170,6 +196,36 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
     return result;
 }
 
+// detect a multi-byte UTF-8 character that was cut off at the end of `text`
+// returns the number of leading bytes that are safe to use: text.size() when no
+// trailing character is truncated, otherwise the offset of the truncated character's lead byte
+static size_t validate_utf8(const std::string& text) {
+    const size_t n_bytes = text.size();
+    if (n_bytes == 0) {
+        return 0;
+    }
+
+    // a UTF-8 character is at most 4 bytes long, so only the last 4 bytes need to be inspected
+    const size_t max_back = n_bytes < 4 ? n_bytes : 4;
+    for (size_t back = 1; back <= max_back; ++back) {
+        const unsigned char byte = text[n_bytes - back];
+        // sequence length announced by `byte` if it is a lead byte, 0 for any other byte
+        size_t seq_len = 0;
+        if ((byte & 0xE0) == 0xC0) {
+            seq_len = 2; // 110xxxxx
+        } else if ((byte & 0xF0) == 0xE0) {
+            seq_len = 3; // 1110xxxx
+        } else if ((byte & 0xF8) == 0xF0) {
+            seq_len = 4; // 11110xxx
+        }
+        // fewer bytes are available from the lead byte to the end than the sequence requires
+        if (back < seq_len) {
+            return n_bytes - back;
+        }
+    }
+    return n_bytes;
+}
+
 //
 // template utils
 //
@@ -333,7 +389,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
     if (res < 2) {
         return "";
     } else {
-        std::vector<char> model_template(res, 0);
+        std::vector<char> model_template(res + 1, 0);
         llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
         return std::string(model_template.data(), model_template.size() - 1);
     }
@@ -558,23 +614,41 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
+static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
     json data = json::array();
+    int32_t n_tokens = 0;
     int i = 0;
     for (const auto & elem : embeddings) {
-        data.push_back(json{
-            {"embedding", json_value(elem, "embedding", json::array())},
-            {"index",     i++},
-            {"object",    "embedding"}
-        });
+        json embedding_obj;
+
+        if (use_base64) {
+            const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
+            const char* data_ptr = reinterpret_cast<const char*>(vec.data());
+            size_t data_size = vec.size() * sizeof(float);
+            embedding_obj = {
+                {"embedding", base64::encode(data_ptr, data_size)},
+                {"index", i++},
+                {"object", "embedding"},
+                {"encoding_format", "base64"}
+            };
+        } else {
+            embedding_obj = {
+                {"embedding", json_value(elem, "embedding", json::array())},
+                {"index", i++},
+                {"object", "embedding"}
+            };
+        }
+        data.push_back(embedding_obj);
+
+        n_tokens += json_value(elem, "tokens_evaluated", 0);
     }
 
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json { // TODO: fill
-            {"prompt_tokens", 0},
-            {"total_tokens", 0}
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
         }},
         {"data", data}
     };
@@ -584,20 +658,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
 
 static json format_response_rerank(const json & request, const json & ranks) {
     json data = json::array();
+    int32_t n_tokens = 0;
     int i = 0;
     for (const auto & rank : ranks) {
         data.push_back(json{
             {"index",    i++},
             {"relevance_score", json_value(rank, "score", 0.0)},
         });
+
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
     }
 
     json res = json {
         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", "list"},
-        {"usage", json { // TODO: fill
-            {"prompt_tokens", 0},
-            {"total_tokens", 0}
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
         }},
         {"results", data}
     };
@@ -664,3 +741,33 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
 static std::string safe_json_to_str(json data) {
     return data.dump(-1, ' ', false, json::error_handler_t::replace);
 }
+
+static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+    std::vector<llama_token_data> cur; // result: one entry per vocab token (id, logit, p), sorted by logit descending
+    const auto * logits = llama_get_logits_ith(ctx, idx); // NOTE(review): dereferenced below without a null check - confirm idx is always valid
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    cur.resize(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; // p starts at 0 and is filled in by the softmax below
+    }
+
+    // sort tokens by logit, highest first
+    std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    });
+
+    // apply softmax over all logits; the maximum is subtracted first so the largest exponent is expf(0)
+    float max_l = cur[0].logit; // first element after the descending sort; assumes n_vocab > 0
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < cur.size(); ++i) {
+        float p = expf(cur[i].logit - max_l);
+        cur[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < cur.size(); ++i) {
+        cur[i].p /= cum_sum; // normalize so the p values sum to 1
+    }
+
+    return cur;
+}
diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html
index c7e18b45e1a..dcdd410797d 100644
--- a/examples/server/webui/index.html
+++ b/examples/server/webui/index.html
@@ -15,7 +15,7 @@
       <!-- sidebar -->
       <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
         <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
-        <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
+        <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
           <div class="flex flex-row items-center justify-between mb-4 mt-4">
             <h2 class="font-bold ml-4">Conversations</h2>
 
@@ -120,51 +120,25 @@ <h2 class="font-bold ml-4">Conversations</h2>
             {{ messages.length === 0 ? 'Send a message to start' : '' }}
           </div>
           <div v-for="msg in messages" class="group">
-            <div :class="{
-              'chat': true,
-              'chat-start': msg.role !== 'user',
-              'chat-end': msg.role === 'user',
-            }">
-              <div :class="{
-                'chat-bubble markdown': true,
-                'chat-bubble-base-300': msg.role !== 'user',
-              }">
-                <!-- textarea for editing message -->
-                <template v-if="editingMsg && editingMsg.id === msg.id">
-                  <textarea
-                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
-                    v-model="msg.content"></textarea>
-                  <br/>
-                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
-                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
-                </template>
-                <!-- render message as markdown -->
-                <vue-markdown v-else :source="msg.content" />
-              </div>
-            </div>
-
-            <!-- actions for each message -->
-            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
-              <!-- user message -->
-              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
-                ✍️ Edit
-              </button>
-              <!-- assistant message -->
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-                🔄 Regenerate
-              </button>
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
-                📋 Copy
-              </button>
-            </div>
+            <message-bubble
+              :config="config"
+              :msg="msg"
+              :key="msg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
+              :regenerate-msg="regenerateMsg"></message-bubble>
           </div>
 
           <!-- pending (ongoing) assistant message -->
-          <div id="pending-msg" class="chat chat-start">
-            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
-              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
-              <vue-markdown v-else :source="pendingMsg.content" />
-            </div>
+          <div id="pending-msg" class="group">
+            <message-bubble
+              v-if="pendingMsg"
+              :config="config"
+              :msg="pendingMsg"
+              :key="pendingMsg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="() => {}"
+              :regenerate-msg="() => {}"></message-bubble>
           </div>
         </div>
 
@@ -227,6 +201,14 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
           <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
             <summary class="collapse-title font-bold">Advanced config</summary>
             <div class="collapse-content">
+              <div class="flex flex-row items-center mb-2" v-if="isDev">
+                <!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
+                <button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
+              </div>
+              <div class="flex flex-row items-center mb-2">
+                <input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
+                <span class="ml-4">Show tokens per second</span>
+              </div>
               <label class="form-control mb-2">
                 <!-- Custom parameters input -->
                 <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
@@ -247,6 +229,66 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
 
   </div>
 
+
+  <!-- Template for the message-bubble element: renders one chat message (edit form or content) followed by its action buttons -->
+  <template id="message-bubble">
+    <div :class="{
+      'chat': true,
+      'chat-start': msg.role !== 'user',
+      'chat-end': msg.role === 'user',
+    }">
+      <div :class="{
+        'chat-bubble markdown': true,
+        'chat-bubble-base-300': msg.role !== 'user',
+      }">
+        <!-- edit mode: shown while editingContent is not null; Cancel resets it to null, Submit calls editMsg() -->
+        <template v-if="editingContent !== null">
+          <textarea
+            class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
+            v-model="editingContent"></textarea>
+          <br/>
+          <button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
+          <button class="btn mt-2" @click="editMsg()">Submit</button>
+        </template>
+        <template v-else>
+          <!-- show loading dots while msg.content is null (no content yet) -->
+          <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
+          <!-- otherwise render the message content as markdown -->
+          <vue-markdown v-else :source="msg.content"></vue-markdown>
+          <!-- render timings only when timings is set and config.showTokensPerSecond is enabled -->
+          <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
+            <div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
+            <div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
+              <b>Prompt</b><br/>
+              - Tokens: {{ timings.prompt_n }}<br/>
+              - Time: {{ timings.prompt_ms }} ms<br/>
+              - Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
+              <b>Generation</b><br/>
+              - Tokens: {{ timings.predicted_n }}<br/>
+              - Time: {{ timings.predicted_ms }} ms<br/>
+              - Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
+            </div>
+          </div>
+        </template>
+      </div>
+    </div>
+    <!-- actions for each message; the row gets opacity-0 and every button is disabled while isGenerating is true -->
+    <div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
+      <!-- user message: Edit seeds editingContent with the current content, which switches the bubble to edit mode -->
+      <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
+        ✍️ Edit
+      </button>
+      <!-- assistant message: Regenerate and Copy -->
+      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+        🔄 Regenerate
+      </button>
+      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
+        📋 Copy
+      </button>
+    </div>
+  </template>
+
+
   <!-- Template to be used by settings modal -->
   <template id="settings-modal-short-input">
     <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json
index 6b93090f062..bbebccbf20c 100644
--- a/examples/server/webui/package-lock.json
+++ b/examples/server/webui/package-lock.json
@@ -8,15 +8,21 @@
       "name": "webui",
       "version": "0.0.0",
       "dependencies": {
+        "@sec-ant/readable-stream": "^0.6.0",
+        "@vscode/markdown-it-katex": "^1.1.1",
         "autoprefixer": "^10.4.20",
         "daisyui": "^4.12.14",
+        "highlight.js": "^11.10.0",
+        "katex": "^0.16.15",
         "markdown-it": "^14.1.0",
         "postcss": "^8.4.49",
         "tailwindcss": "^3.4.15",
+        "textlinestream": "^1.1.1",
         "vite-plugin-singlefile": "^2.0.3",
         "vue": "^3.5.13"
       },
       "devDependencies": {
+        "sass-embedded": "^1.83.0",
         "vite": "^5.4.10"
       }
     },
@@ -32,6 +38,13 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/@bufbuild/protobuf": {
+      "version": "2.2.3",
+      "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz",
+      "integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==",
+      "devOptional": true,
+      "license": "(Apache-2.0 AND BSD-3-Clause)"
+    },
     "node_modules/@esbuild/aix-ppc64": {
       "version": "0.21.5",
       "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
@@ -605,6 +618,21 @@
         "win32"
       ]
     },
+    "node_modules/@sec-ant/readable-stream": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz",
+      "integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==",
+      "license": "MIT"
+    },
+    "node_modules/@vscode/markdown-it-katex": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz",
+      "integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==",
+      "license": "MIT",
+      "dependencies": {
+        "katex": "^0.16.4"
+      }
+    },
     "node_modules/@vue/compiler-dom": {
       "version": "3.5.13",
       "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz",
@@ -1003,6 +1031,13 @@
         "browserslist": ">= 4.21.0"
       }
     },
+    "node_modules/buffer-builder": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz",
+      "integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==",
+      "devOptional": true,
+      "license": "MIT/X11"
+    },
     "node_modules/caniuse-lite": {
       "version": "1.0.30001684",
       "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz",
@@ -1165,6 +1200,22 @@
         "node": ">=8.0"
       }
     },
+    "node_modules/colorjs.io": {
+      "version": "0.5.2",
+      "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz",
+      "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==",
+      "devOptional": true,
+      "license": "MIT"
+    },
+    "node_modules/commander": {
+      "version": "8.3.0",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz",
+      "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 12"
+      }
+    },
     "node_modules/css-selector-tokenizer": {
       "version": "0.8.0",
       "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
@@ -1472,6 +1523,31 @@
         "node": ">=10.13.0"
       }
     },
+    "node_modules/has-flag": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+      "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+      "devOptional": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/highlight.js": {
+      "version": "11.10.0",
+      "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz",
+      "integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==",
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
+    "node_modules/immutable": {
+      "version": "5.0.3",
+      "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz",
+      "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==",
+      "devOptional": true,
+      "license": "MIT"
+    },
     "node_modules/is-glob": {
       "version": "4.0.3",
       "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
@@ -1502,6 +1578,22 @@
         "jiti": "bin/jiti.js"
       }
     },
+    "node_modules/katex": {
+      "version": "0.16.15",
+      "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.15.tgz",
+      "integrity": "sha512-yE9YJIEAk2aZ+FL/G8r+UGw0CTUzEA8ZFy6E+8tc3spHUKq3qBnzCkI1CQwGoI9atJhVyFPEypQsTY7mJ1Pi9w==",
+      "funding": [
+        "https://opencollective.com/katex",
+        "https://github.com/sponsors/katex"
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "commander": "^8.3.0"
+      },
+      "bin": {
+        "katex": "cli.js"
+      }
+    },
     "node_modules/lilconfig": {
       "version": "2.1.0",
       "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
@@ -2021,6 +2113,381 @@
       "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
       "license": "MIT"
     },
+    "node_modules/rxjs": {
+      "version": "7.8.1",
+      "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz",
+      "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==",
+      "devOptional": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "tslib": "^2.1.0"
+      }
+    },
+    "node_modules/sass-embedded": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.0.tgz",
+      "integrity": "sha512-/8cYZeL39evUqe0o//193na51Q1VWZ61qhxioQvLJwOtWIrX+PgNhCyD8RSuTtmzc4+6+waFZf899bfp/MCUwA==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "@bufbuild/protobuf": "^2.0.0",
+        "buffer-builder": "^0.2.0",
+        "colorjs.io": "^0.5.0",
+        "immutable": "^5.0.2",
+        "rxjs": "^7.4.0",
+        "supports-color": "^8.1.1",
+        "sync-child-process": "^1.0.2",
+        "varint": "^6.0.0"
+      },
+      "bin": {
+        "sass": "dist/bin/sass.js"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      },
+      "optionalDependencies": {
+        "sass-embedded-android-arm": "1.83.0",
+        "sass-embedded-android-arm64": "1.83.0",
+        "sass-embedded-android-ia32": "1.83.0",
+        "sass-embedded-android-riscv64": "1.83.0",
+        "sass-embedded-android-x64": "1.83.0",
+        "sass-embedded-darwin-arm64": "1.83.0",
+        "sass-embedded-darwin-x64": "1.83.0",
+        "sass-embedded-linux-arm": "1.83.0",
+        "sass-embedded-linux-arm64": "1.83.0",
+        "sass-embedded-linux-ia32": "1.83.0",
+        "sass-embedded-linux-musl-arm": "1.83.0",
+        "sass-embedded-linux-musl-arm64": "1.83.0",
+        "sass-embedded-linux-musl-ia32": "1.83.0",
+        "sass-embedded-linux-musl-riscv64": "1.83.0",
+        "sass-embedded-linux-musl-x64": "1.83.0",
+        "sass-embedded-linux-riscv64": "1.83.0",
+        "sass-embedded-linux-x64": "1.83.0",
+        "sass-embedded-win32-arm64": "1.83.0",
+        "sass-embedded-win32-ia32": "1.83.0",
+        "sass-embedded-win32-x64": "1.83.0"
+      }
+    },
+    "node_modules/sass-embedded-android-arm": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.0.tgz",
+      "integrity": "sha512-uwFSXzJlfbd4Px189xE5l+cxN8+TQpXdQgJec7TIrb4HEY7imabtpYufpVdqUVwT1/uiis5V4+qIEC4Vl5XObQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-arm64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.0.tgz",
+      "integrity": "sha512-GBiCvM4a2rkWBLdYDxI6XYnprfk5U5c81g69RC2X6kqPuzxzx8qTArQ9M6keFK4+iDQ5N9QTwFCr0KbZTn+ZNQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-ia32": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.0.tgz",
+      "integrity": "sha512-5ATPdGo2SICqAhiJl/Z8KQ23zH4sGgobGgux0TnrNtt83uHZ+r+To/ubVJ7xTkZxed+KJZnIpolGD8dQyQqoTg==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-riscv64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.0.tgz",
+      "integrity": "sha512-aveknUOB8GZewOzVn2Uwk+DKcncTR50Q6vtzslNMGbYnxtgQNHzy8A1qVEviNUruex+pHofppeMK4iMPFAbiEQ==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-x64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.0.tgz",
+      "integrity": "sha512-WqIay/72ncyf9Ph4vS742J3a73wZihWmzFUwpn1OD6lme1Aj4eWzWIve5IVnlTEJgcZcDHu6ECID9IZgehJKoA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-darwin-arm64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.0.tgz",
+      "integrity": "sha512-XQl9QqgxFFIPm/CzHhmppse5o9ocxrbaAdC2/DAnlAqvYWBBtgFqPjGoYlej13h9SzfvNoogx+y9r+Ap+e+hYg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-darwin-x64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.0.tgz",
+      "integrity": "sha512-ERQ7Tvp1kFOW3ux4VDFIxb7tkYXHYc+zJpcrbs0hzcIO5ilIRU2tIOK1OrNwrFO6Qxyf7AUuBwYKLAtIU/Nz7g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-arm": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.0.tgz",
+      "integrity": "sha512-baG9RYBJxUFmqwDNC9h9ZFElgJoyO3jgHGjzEZ1wHhIS9anpG+zZQvO8bHx3dBpKEImX+DBeLX+CxsFR9n81gQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-arm64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.0.tgz",
+      "integrity": "sha512-syEAVTJt4qhaMLxrSwOWa46zdqHJdnqJkLUK+t9aCr8xqBZLPxSUeIGji76uOehQZ1C+KGFj6n9xstHN6wzOJw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-ia32": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.0.tgz",
+      "integrity": "sha512-RRBxQxMpoxu5+XcSSc6QR/o9asEwUzR8AbCS83RaXcdTIHTa/CccQsiAoDDoPlRsMTLqnzs0LKL4CfOsf7zBbA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-arm": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.0.tgz",
+      "integrity": "sha512-Yc7u2TelCfBab+PRob9/MNJFh3EooMiz4urvhejXkihTiKSHGCv5YqDdtWzvyb9tY2Jb7YtYREVuHwfdVn3dTQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-arm64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.0.tgz",
+      "integrity": "sha512-Y7juhPHClUO2H5O+u+StRy6SEAcwZ+hTEk5WJdEmo1Bb1gDtfHvJaWB/iFZJ2tW0W1e865AZeUrC4OcOFjyAQA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-ia32": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.0.tgz",
+      "integrity": "sha512-arQeYwGmwXV8byx5G1PtSzZWW1jbkfR5qrIHMEbTFSAvAxpqjgSvCvrHMOFd73FcMxVaYh4BX9LQNbKinkbEdg==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-riscv64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.0.tgz",
+      "integrity": "sha512-E6uzlIWz59rut+Z3XR6mLG915zNzv07ISvj3GUNZENdHM7dF8GQ//ANoIpl5PljMQKp89GnYdvo6kj2gnaBf/g==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-x64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.0.tgz",
+      "integrity": "sha512-eAMK6tyGqvqr21r9g8BnR3fQc1rYFj85RGduSQ3xkITZ6jOAnOhuU94N5fwRS852Hpws0lXhET+7JHXgg3U18w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-riscv64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.0.tgz",
+      "integrity": "sha512-Ojpi78pTv02sy2fUYirRGXHLY3fPnV/bvwuC2i5LwPQw2LpCcFyFTtN0c5h4LJDk9P6wr+/ZB/JXU8tHIOlK+Q==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-x64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.0.tgz",
+      "integrity": "sha512-3iLjlXdoPfgZRtX4odhRvka1BQs5mAXqfCtDIQBgh/o0JnGPzJIWWl9bYLpHxK8qb+uyVBxXYgXpI0sCzArBOw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-arm64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.0.tgz",
+      "integrity": "sha512-iOHw/8/t2dlTW3lOFwG5eUbiwhEyGWawivlKWJ8lkXH7fjMpVx2VO9zCFAm8RvY9xOHJ9sf1L7g5bx3EnNP9BQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-ia32": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.0.tgz",
+      "integrity": "sha512-2PxNXJ8Pad4geVcTXY4rkyTr5AwbF8nfrCTDv0ulbTvPhzX2mMKEGcBZUXWn5BeHZTBc6whNMfS7d5fQXR9dDQ==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-x64": {
+      "version": "1.83.0",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.0.tgz",
+      "integrity": "sha512-muBXkFngM6eLTNqOV0FQi7Dv9s+YRQ42Yem26mosdan/GmJQc81deto6uDTgrYn+bzFNmiXcOdfm+0MkTWK3OQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
     "node_modules/sucrase": {
       "version": "3.35.0",
       "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
@@ -2640,6 +3107,45 @@
         "node": ">=8"
       }
     },
+    "node_modules/supports-color": {
+      "version": "8.1.1",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
+      "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-flag": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/supports-color?sponsor=1"
+      }
+    },
+    "node_modules/sync-child-process": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz",
+      "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "sync-message-port": "^1.0.0"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
+    "node_modules/sync-message-port": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz",
+      "integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==",
+      "devOptional": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
     "node_modules/tailwindcss": {
       "version": "3.4.15",
       "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz",
@@ -2677,12 +3183,32 @@
         "node": ">=14.0.0"
       }
     },
+    "node_modules/textlinestream": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
+      "integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
+      "license": "MIT"
+    },
+    "node_modules/tslib": {
+      "version": "2.8.1",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+      "devOptional": true,
+      "license": "0BSD"
+    },
     "node_modules/uc.micro": {
       "version": "2.1.0",
       "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
       "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==",
       "license": "MIT"
     },
+    "node_modules/varint": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz",
+      "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==",
+      "devOptional": true,
+      "license": "MIT"
+    },
     "node_modules/vite": {
       "version": "5.4.11",
       "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz",
diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json
index 2a45ece14be..2836cce00d4 100644
--- a/examples/server/webui/package.json
+++ b/examples/server/webui/package.json
@@ -6,17 +6,24 @@
   "scripts": {
     "dev": "vite",
     "build": "vite build",
-    "preview": "vite preview"
+    "preview": "vite preview",
+    "analyze": "ANALYZE=1 npx vite-bundle-visualizer"
   },
   "devDependencies": {
+    "sass-embedded": "^1.83.0",
     "vite": "^5.4.10"
   },
   "dependencies": {
+    "@sec-ant/readable-stream": "^0.6.0",
+    "@vscode/markdown-it-katex": "^1.1.1",
     "autoprefixer": "^10.4.20",
     "daisyui": "^4.12.14",
+    "highlight.js": "^11.10.0",
+    "katex": "^0.16.15",
     "markdown-it": "^14.1.0",
     "postcss": "^8.4.49",
     "tailwindcss": "^3.4.15",
+    "textlinestream": "^1.1.1",
     "vite-plugin-singlefile": "^2.0.3",
     "vue": "^3.5.13"
   }
diff --git a/examples/server/webui/public/demo-conversation.json b/examples/server/webui/public/demo-conversation.json
new file mode 100644
index 00000000000..75ab599dd6e
--- /dev/null
+++ b/examples/server/webui/public/demo-conversation.json
@@ -0,0 +1,33 @@
+{
+  "demo": true,
+  "id": "conv-1734086746930",
+  "lastModified": 1734087548943,
+  "messages": [
+    {
+      "id": 1734086764521,
+      "role": "user",
+      "content": "this is a demo conversation, used in dev mode"
+    },
+    {
+      "id": 1734087548327,
+      "role": "assistant",
+      "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
+      "timings": {
+        "prompt_n": 1,
+        "prompt_ms": 28.923,
+        "predicted_n": 25,
+        "predicted_ms": 573.016
+      }
+    },
+    {
+      "id": 1734087548328,
+      "role": "user",
+      "content": "this is a demo conversation, used in dev mode"
+    },
+    {
+      "id": 1734087548329,
+      "role": "assistant",
+      "content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```"
+    }
+  ]
+}
diff --git a/examples/server/webui/src/completion.js b/examples/server/webui/src/completion.js
deleted file mode 100644
index 54a0f22f58e..00000000000
--- a/examples/server/webui/src/completion.js
+++ /dev/null
@@ -1,225 +0,0 @@
-const paramDefaults = {
-  stream: true,
-  temperature: 0.2,
-};
-
-let generation_settings = null;
-
-export class CompletionError extends Error {
-  constructor(message, name, data) {
-    super(message);
-    this.name = name;
-  }
-};
-
-// Completes the prompt as a generator. Recommended for most use cases.
-//
-// Example:
-//
-//    import { llama } from '/completion.js'
-//
-//    const request = llama("Tell me a joke", {n_predict: 800})
-//    for await (const chunk of request) {
-//      document.write(chunk.data.content)
-//    }
-//
-export async function* llama(prompt, params = {}, config = {}) {
-  let controller = config.controller;
-  const api_url = config.api_url?.replace(/\/+$/, '') || "";
-
-  if (!controller) {
-    controller = new AbortController();
-  }
-
-  const completionParams = { ...paramDefaults, ...params, prompt };
-
-  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
-    method: 'POST',
-    body: JSON.stringify(completionParams),
-    headers: {
-      'Connection': 'keep-alive',
-      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream',
-      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
-    },
-    signal: controller.signal,
-  });
-
-  const status = response.status;
-  if (status !== 200) {
-    try {
-      const body = await response.json();
-      if (body && body.error && body.error.message) {
-        throw new CompletionError(body.error.message, 'ServerError');
-      }
-    } catch (err) {
-      throw new CompletionError(err.message, 'ServerError');
-    }
-  }
-
-  const reader = response.body.getReader();
-  const decoder = new TextDecoder();
-
-  let content = "";
-  let leftover = ""; // Buffer for partially read lines
-
-  try {
-    let cont = true;
-
-    while (cont) {
-      const result = await reader.read();
-      if (result.done) {
-        break;
-      }
-
-      // Add any leftover data to the current chunk of data
-      const text = leftover + decoder.decode(result.value);
-
-      // Check if the last character is a line break
-      const endsWithLineBreak = text.endsWith('\n');
-
-      // Split the text into lines
-      let lines = text.split('\n');
-
-      // If the text doesn't end with a line break, then the last line is incomplete
-      // Store it in leftover to be added to the next chunk of data
-      if (!endsWithLineBreak) {
-        leftover = lines.pop();
-      } else {
-        leftover = ""; // Reset leftover if we have a line break at the end
-      }
-
-      // Parse all sse events and add them to result
-      const regex = /^(\S+):\s(.*)$/gm;
-      for (const line of lines) {
-        const match = regex.exec(line);
-        if (match) {
-          result[match[1]] = match[2];
-          if (result.data === '[DONE]') {
-            cont = false;
-            break;
-          }
-
-          // since we know this is llama.cpp, let's just decode the json in data
-          if (result.data) {
-            result.data = JSON.parse(result.data);
-            content += result.data.content;
-
-            // yield
-            yield result;
-
-            // if we got a stop token from server, we will break here
-            if (result.data.stop) {
-              if (result.data.generation_settings) {
-                generation_settings = result.data.generation_settings;
-              }
-              cont = false;
-              break;
-            }
-          }
-          if (result.error) {
-            try {
-              result.error = JSON.parse(result.error);
-              if (result.error.message.includes('slot unavailable')) {
-                // Throw an error to be caught by upstream callers
-                throw new Error('slot unavailable');
-              } else {
-                console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
-              }
-            } catch(e) {
-              console.error(`llama.cpp error ${result.error}`)
-            }
-          }
-        }
-      }
-    }
-  } catch (e) {
-    if (e.name !== 'AbortError') {
-      console.error("llama error: ", e);
-    }
-    throw e;
-  }
-  finally {
-    controller.abort();
-  }
-
-  return content;
-}
-
-// Call llama, return an event target that you can subscribe to
-//
-// Example:
-//
-//    import { llamaEventTarget } from '/completion.js'
-//
-//    const conn = llamaEventTarget(prompt)
-//    conn.addEventListener("message", (chunk) => {
-//      document.write(chunk.detail.content)
-//    })
-//
-export const llamaEventTarget = (prompt, params = {}, config = {}) => {
-  const eventTarget = new EventTarget();
-  (async () => {
-    let content = "";
-    for await (const chunk of llama(prompt, params, config)) {
-      if (chunk.data) {
-        content += chunk.data.content;
-        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
-      }
-      if (chunk.data.generation_settings) {
-        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
-      }
-      if (chunk.data.timings) {
-        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
-      }
-    }
-    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
-  })();
-  return eventTarget;
-}
-
-// Call llama, return a promise that resolves to the completed text. This does not support streaming
-//
-// Example:
-//
-//     llamaPromise(prompt).then((content) => {
-//       document.write(content)
-//     })
-//
-//     or
-//
-//     const content = await llamaPromise(prompt)
-//     document.write(content)
-//
-export const llamaPromise = (prompt, params = {}, config = {}) => {
-  return new Promise(async (resolve, reject) => {
-    let content = "";
-    try {
-      for await (const chunk of llama(prompt, params, config)) {
-        content += chunk.data.content;
-      }
-      resolve(content);
-    } catch (error) {
-      reject(error);
-    }
-  });
-};
-
-/**
- * (deprecated)
- */
-export const llamaComplete = async (params, controller, callback) => {
-  for await (const chunk of llama(params.prompt, params, { controller })) {
-    callback(chunk);
-  }
-}
-
-// Get the model info from the server. This is useful for getting the context window and so on.
-export const llamaModelInfo = async (config = {}) => {
-  if (!generation_settings) {
-    const api_url = config.api_url?.replace(/\/+$/, '') || "";
-    const props = await fetch(`${api_url}/props`).then(r => r.json());
-    generation_settings = props.default_generation_settings;
-  }
-  return generation_settings;
-}
diff --git a/examples/server/webui/src/highlight-config.js b/examples/server/webui/src/highlight-config.js
new file mode 100644
index 00000000000..96c7028f98b
--- /dev/null
+++ b/examples/server/webui/src/highlight-config.js
@@ -0,0 +1,60 @@
+import hljs from 'highlight.js/lib/core';
+
+// only import commonly used languages to reduce bundle size
+
+import python from 'highlight.js/lib/languages/python';
+import javascript from 'highlight.js/lib/languages/javascript';
+import json from 'highlight.js/lib/languages/json';
+import bash from 'highlight.js/lib/languages/bash';
+import yaml from 'highlight.js/lib/languages/yaml';
+import markdown from 'highlight.js/lib/languages/markdown';
+import scss from 'highlight.js/lib/languages/scss';
+import xml from 'highlight.js/lib/languages/xml';
+import ruby from 'highlight.js/lib/languages/ruby';
+import go from 'highlight.js/lib/languages/go';
+import java from 'highlight.js/lib/languages/java';
+import rust from 'highlight.js/lib/languages/rust';
+import scala from 'highlight.js/lib/languages/scala';
+import cpp from 'highlight.js/lib/languages/cpp';
+import csharp from 'highlight.js/lib/languages/csharp';
+import swift from 'highlight.js/lib/languages/swift';
+import dart from 'highlight.js/lib/languages/dart';
+import elixir from 'highlight.js/lib/languages/elixir';
+import kotlin from 'highlight.js/lib/languages/kotlin';
+import lua from 'highlight.js/lib/languages/lua';
+import php from 'highlight.js/lib/languages/php';
+import latex from 'highlight.js/lib/languages/latex';
+
+hljs.registerLanguage('python', python);
+hljs.registerLanguage('javascript', javascript);
+hljs.registerLanguage('json', json);
+hljs.registerLanguage('yaml', yaml);
+hljs.registerLanguage('markdown', markdown);
+hljs.registerLanguage('xml', xml);
+hljs.registerLanguage('ruby', ruby);
+hljs.registerLanguage('go', go);
+hljs.registerLanguage('java', java);
+hljs.registerLanguage('rust', rust);
+hljs.registerLanguage('scala', scala);
+hljs.registerLanguage('csharp', csharp);
+hljs.registerLanguage('swift', swift);
+hljs.registerLanguage('dart', dart);
+hljs.registerLanguage('elixir', elixir);
+hljs.registerLanguage('kotlin', kotlin);
+hljs.registerLanguage('lua', lua);
+hljs.registerLanguage('php', php);
+hljs.registerLanguage('latex', latex);
+
+// reuse some languages to further reduce bundle size
+
+hljs.registerLanguage('shell', bash);
+hljs.registerLanguage('bash', bash);
+hljs.registerLanguage('sh', bash);
+
+hljs.registerLanguage('css', scss);
+hljs.registerLanguage('scss', scss);
+
+hljs.registerLanguage('c', cpp);
+hljs.registerLanguage('cpp', cpp);
+
+export default hljs;
diff --git a/examples/server/webui/src/katex-gpt.js b/examples/server/webui/src/katex-gpt.js
new file mode 100644
index 00000000000..7c7c5e22c1c
--- /dev/null
+++ b/examples/server/webui/src/katex-gpt.js
@@ -0,0 +1,66 @@
+import katex from 'katex';
+
+// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
+// MIT license
+
+const defaultOptions = {
+  delimiters: [
+    { left: '\\[', right: '\\]', display: true },
+    { left: '\\(', right: '\\)', display: false },
+  ],
+};
+
+export function renderLatexHTML(content, display = false) {
+  return katex.renderToString(content, {
+    throwOnError: false,
+    output: 'mathml',
+    displayMode: display,
+  });
+}
+
+/**
+ * markdown-it inline rule factory: renders LaTeX wrapped in one of `options.delimiters`
+ * through renderLatexHTML. The rule returns true when it consumed a formula at state.pos.
+ */
+function escapedBracketRule(options) {
+  return (state, silent) => {
+    const max = state.posMax;
+    const start = state.pos;
+
+    for (const { left, right, display } of options.delimiters) {
+      // Compare in place at `start`; no need to copy the rest of the source
+      if (!state.src.startsWith(left, start)) continue;
+
+      // The formula begins right after the left delimiter
+      const contentStart = start + left.length;
+
+      // One linear search for the right delimiter instead of re-slicing per character
+      const pos = state.src.indexOf(right, contentStart);
+
+      // No right delimiter starting before posMax, try the next delimiter pair
+      if (pos === -1 || pos >= max) continue;
+
+      // If not in silent mode, convert LaTeX formula to MathML
+      if (!silent) {
+        const content = state.src.slice(contentStart, pos);
+        try {
+          const renderedContent = renderLatexHTML(content, display);
+          const token = state.push('html_inline', '', 0);
+          token.content = renderedContent;
+        } catch (e) {
+          console.error(e);
+        }
+      }
+
+      // Update position, skip the length of the right delimiter
+      state.pos = pos + right.length;
+      return true;
+    }
+
+    return false; // nothing matched here; let the other inline rules handle it
+  }
+}
+
+export default function (md, options = defaultOptions) {
+  md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
+}
diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js
index 9b5b1232933..358a40628a3 100644
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@@ -1,23 +1,57 @@
-import './styles.css';
+import './styles.scss';
 import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
-import { llama } from './completion.js';
 import MarkdownIt from 'markdown-it';
+import TextLineStream from 'textlinestream';
+
+// math formula rendering
+import 'katex/dist/katex.min.css';
+import markdownItKatexGpt from './katex-gpt';
+import markdownItKatexNormal from '@vscode/markdown-it-katex';
+
+// code highlighting
+import hljs from './highlight-config';
+import daisyuiThemes from 'daisyui/src/theming/themes';
+
+// ponyfill for missing ReadableStream asyncIterator on Safari
+import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
+
+const isDev = import.meta.env.MODE === 'development';
 
 // utility functions
 const isString = (x) => !!x.toLowerCase;
-const isNumeric = (n) => !isString(n) && !isNaN(n);
+const isBoolean = (x) => x === true || x === false;
+const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
 const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
-const copyStr = (str) => navigator.clipboard.writeText(str);
+// Copy text to the clipboard; outside a secure context fall back to a temporary textarea.
+const copyStr = (textToCopy) => {
+  // Navigator clipboard api needs a secure context (https)
+  if (navigator.clipboard && window.isSecureContext) {
+    navigator.clipboard.writeText(textToCopy);
+  } else {
+    // Fallback: select a textarea moved out of the viewport, then execCommand('copy')
+    const textArea = document.createElement('textarea');
+    textArea.value = textToCopy;
+    Object.assign(textArea.style, { position: 'absolute', left: '-999999px' });
+    document.body.prepend(textArea);
+    textArea.select();
+    document.execCommand('copy');
+    textArea.remove(); // otherwise every copy leaves another hidden textarea in the DOM
+  }
+};
 
 // constants
-const BASE_URL = localStorage.getItem('base') // for debugging
-  || (new URL('.', document.baseURI).href).toString(); // for production
+const BASE_URL = isDev
+  ? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
+  : (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
+console.log({ BASE_URL });
+
 const CONFIG_DEFAULT = {
   // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
   apiKey: '',
   systemMessage: 'You are a helpful assistant.',
+  showTokensPerSecond: false,
   // make sure these default values are in sync with `common.h`
-  samplers: 'dkypmxt',
+  samplers: 'edkypmxt',
   temperature: 0.8,
   dynatemp_range: 0.0,
   dynatemp_exponent: 1.0,
@@ -65,12 +99,39 @@ const CONFIG_INFO = {
 // config keys having numeric value (i.e. temperature, top_k, top_p, etc)
 const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
 // list of themes supported by daisyui
-const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
+const THEMES = ['light', 'dark']
+  // make sure light & dark are always at the beginning
+  .concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
 
 // markdown support
 const VueMarkdown = defineComponent(
   (props) => {
-    const md = shallowRef(new MarkdownIt({ breaks: true }));
+    const md = shallowRef(new MarkdownIt({
+      breaks: true,
+      highlight: function (str, lang) { // Add highlight.js
+        if (lang && hljs.getLanguage(lang)) {
+          try {
+            return '<pre><code class="hljs">' +
+                   hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
+                   '</code></pre>';
+          } catch (__) {}
+        }
+        return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
+      }
+    }));
+    // support latex with double dollar sign and square brackets
+    md.value.use(markdownItKatexGpt, {
+      delimiters: [
+        { left: '\\[', right: '\\]', display: true },
+        { left: '\\(', right: '\\)', display: false },
+        { left: '$$', right: '$$', display: false },
+        // do not add a single dollar sign here, otherwise it will be confused with the dollar sign used as a currency symbol
+      ],
+      throwOnError: false,
+    });
+    // support latex with single dollar sign
+    md.value.use(markdownItKatexNormal, { throwOnError: false });
+    // add copy button to code blocks
     const origFenchRenderer = md.value.renderer.rules.fence;
     md.value.renderer.rules.fence = (tokens, idx, ...args) => {
       const content = tokens[idx].content;
@@ -84,9 +145,9 @@ const VueMarkdown = defineComponent(
     };
     window.copyStr = copyStr;
     const content = computed(() => md.value.render(props.source));
-    return () => h("div", { innerHTML: content.value });
+    return () => h('div', { innerHTML: content.value });
   },
-  { props: ["source"] }
+  { props: ['source'] }
 );
 
 // input field to be used by settings modal
@@ -101,6 +162,48 @@ const SettingsModalShortInput = defineComponent({
   },
 });
 
+// message bubble component
+const MessageBubble = defineComponent({
+  components: {
+    VueMarkdown
+  },
+  template: document.getElementById('message-bubble').innerHTML,
+  props: {
+    config: Object,
+    msg: Object,
+    isGenerating: Boolean,
+    editUserMsgAndRegenerate: Function,
+    regenerateMsg: Function,
+  },
+  data() {
+    return {
+      editingContent: null,
+    };
+  },
+  computed: {
+    timings() {
+      if (!this.msg.timings) return null;
+      return {
+        ...this.msg.timings,
+        prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
+        predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
+      };
+    }
+  },
+  methods: {
+    copyMsg() {
+      copyStr(this.msg.content);
+    },
+    editMsg() {
+      this.editUserMsgAndRegenerate({
+        ...this.msg,
+        content: this.editingContent,
+      });
+      this.editingContent = null;
+    },
+  },
+});
+
 // coversations is stored in localStorage
 // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
 // convId is a string prefixed with 'conv-'
@@ -192,10 +295,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
   }
 };
 
+// wrapper for SSE: POST to `url`, yield the parsed JSON of each `data:` line, throw on HTTP or `error:` failures
+async function* sendSSEPostRequest(url, fetchOptions) {
+  const res = await fetch(url, fetchOptions);
+  if (!res.ok) { // a failed request has no event stream; it may carry { error: { message } } as JSON
+    const body = await res.json().catch(() => null);
+    throw new Error(body?.error?.message || `Request failed with status ${res.status}`);
+  }
+  const lines = res.body.pipeThrough(new TextDecoderStream()).pipeThrough(new TextLineStream());
+  for await (const line of asyncIterator(lines)) {
+    if (isDev) console.log({line});
+    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
+      yield JSON.parse(line.slice(5)); // strip the 'data:' prefix
+    } else if (line.startsWith('error:')) {
+      throw new Error(JSON.parse(line.slice(6)).message || 'Unknown error');
+    }
+  }
+};
+
 const mainApp = createApp({
   components: {
     VueMarkdown,
     SettingsModalShortInput,
+    MessageBubble,
   },
   data() {
     return {
@@ -209,11 +331,11 @@ const mainApp = createApp({
       selectedTheme: StorageUtils.getTheme(),
       config: StorageUtils.getConfig(),
       showConfigDialog: false,
-      editingMsg: null,
       // const
       themes: THEMES,
       configDefault: {...CONFIG_DEFAULT},
       configInfo: {...CONFIG_INFO},
+      isDev,
     }
   },
   computed: {},
@@ -225,6 +347,16 @@ const mainApp = createApp({
       if (this.isGenerating) chatScrollToBottom(true);
     });
     resizeObserver.observe(pendingMsgElem);
+    this.setSelectedTheme(this.selectedTheme);
+  },
+  watch: {
+    viewingConvId: function(val, oldVal) {
+      if (val != oldVal) {
+        this.fetchMessages();
+        chatScrollToBottom();
+        this.hideSidebar();
+      }
+    }
   },
   methods: {
     hideSidebar() {
@@ -232,23 +364,17 @@ const mainApp = createApp({
     },
     setSelectedTheme(theme) {
       this.selectedTheme = theme;
+      document.body.setAttribute('data-theme', theme);
+      document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
       StorageUtils.setTheme(theme);
     },
     newConversation() {
       if (this.isGenerating) return;
       this.viewingConvId = StorageUtils.getNewConvId();
-      this.editingMsg = null;
-      this.fetchMessages();
-      chatScrollToBottom();
-      this.hideSidebar();
     },
     setViewingConv(convId) {
       if (this.isGenerating) return;
       this.viewingConvId = convId;
-      this.editingMsg = null;
-      this.fetchMessages();
-      chatScrollToBottom();
-      this.hideSidebar();
     },
     deleteConv(convId) {
       if (this.isGenerating) return;
@@ -256,7 +382,6 @@ const mainApp = createApp({
         StorageUtils.remove(convId);
         if (this.viewingConvId === convId) {
           this.viewingConvId = StorageUtils.getNewConvId();
-          this.editingMsg = null;
         }
         this.fetchConversation();
         this.fetchMessages();
@@ -291,7 +416,6 @@ const mainApp = createApp({
       this.fetchConversation();
       this.fetchMessages();
       this.inputMsg = '';
-      this.editingMsg = null;
       this.generateMessage(currConvId);
       chatScrollToBottom();
     },
@@ -299,7 +423,6 @@ const mainApp = createApp({
       if (this.isGenerating) return;
       this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
       this.isGenerating = true;
-      this.editingMsg = null;
 
       try {
         const abortController = new AbortController();
@@ -330,17 +453,21 @@ const mainApp = createApp({
           dry_allowed_length: this.config.dry_allowed_length,
           dry_penalty_last_n: this.config.dry_penalty_last_n,
           max_tokens: this.config.max_tokens,
+          timings_per_token: !!this.config.showTokensPerSecond,
           ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
-          ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
         };
-        const config = {
-          controller: abortController,
-          api_url: BASE_URL,
-          endpoint: '/chat/completions',
-        };
-        for await (const chunk of llama(prompt, params, config)) {
-          const stop = chunk.data.stop;
-          const addedContent = chunk.data.choices[0].delta.content;
+        const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
+          method: 'POST',
+          headers: {
+            'Content-Type': 'application/json',
+            ...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
+          },
+          body: JSON.stringify(params),
+          signal: abortController.signal,
+        });
+        for await (const chunk of chunks) {
+          const stop = chunk.stop;
+          const addedContent = chunk.choices[0].delta.content;
           const lastContent = this.pendingMsg.content || '';
           if (addedContent) {
             this.pendingMsg = {
@@ -349,6 +476,16 @@ const mainApp = createApp({
               content: lastContent + addedContent,
             };
           }
+          const timings = chunk.timings;
+          if (timings && this.config.showTokensPerSecond) {
+            // only extract what's really needed, to save some space
+            this.pendingMsg.timings = {
+              prompt_n: timings.prompt_n,
+              prompt_ms: timings.prompt_ms,
+              predicted_n: timings.predicted_n,
+              predicted_ms: timings.predicted_ms,
+            };
+          }
         }
 
         StorageUtils.appendMsg(currConvId, this.pendingMsg);
@@ -387,14 +524,10 @@ const mainApp = createApp({
       this.fetchMessages();
       this.generateMessage(currConvId);
     },
-    copyMsg(msg) {
-      copyStr(msg.content);
-    },
     editUserMsgAndRegenerate(msg) {
       if (this.isGenerating) return;
       const currConvId = this.viewingConvId;
       const newContent = msg.content;
-      this.editingMsg = null;
       StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
       StorageUtils.appendMsg(currConvId, {
         id: Date.now(),
@@ -441,6 +574,17 @@ const mainApp = createApp({
     fetchMessages() {
       this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
     },
+
+    // debug functions
+    async debugImportDemoConv() {
+      const res = await fetch('/demo-conversation.json');
+      const demoConv = await res.json();
+      StorageUtils.remove(demoConv.id);
+      for (const msg of demoConv.messages) {
+        StorageUtils.appendMsg(demoConv.id, msg);
+      }
+      this.fetchConversation();
+    }
   },
 });
 mainApp.config.errorHandler = alert;
diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css
deleted file mode 100644
index 67d35b99e49..00000000000
--- a/examples/server/webui/src/styles.css
+++ /dev/null
@@ -1,26 +0,0 @@
-@tailwind base;
-@tailwind components;
-@tailwind utilities;
-
-.markdown {
-  h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
-  pre {
-    @apply whitespace-pre-wrap rounded-lg p-2;
-    border: 1px solid currentColor;
-  }
-  /* TODO: fix markdown table */
-}
-
-.show-on-hover {
-  @apply md:opacity-0 md:group-hover:opacity-100;
-}
-.btn-mini {
-  @apply cursor-pointer hover:shadow-md;
-}
-.chat-screen { max-width: 900px; }
-
-.chat-bubble-base-300 {
-  --tw-bg-opacity: 1;
-  --tw-text-opacity: 1;
-  @apply bg-base-300 text-base-content;
-}
diff --git a/examples/server/webui/src/styles.scss b/examples/server/webui/src/styles.scss
new file mode 100644
index 00000000000..34fe2aaf01c
--- /dev/null
+++ b/examples/server/webui/src/styles.scss
@@ -0,0 +1,48 @@
+@use "sass:meta";
+
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+.markdown {
+  h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+  pre {
+    @apply whitespace-pre-wrap rounded-lg p-2;
+    border: 1px solid currentColor;
+  }
+  /* TODO: fix markdown table */
+}
+
+.show-on-hover {
+  @apply md:opacity-0 md:group-hover:opacity-100;
+}
+.btn-mini {
+  @apply cursor-pointer hover:shadow-md;
+}
+.chat-screen { max-width: 900px; }
+
+.chat-bubble-base-300 {
+  --tw-bg-opacity: 1;
+  --tw-text-opacity: 1;
+  @apply bg-base-300 text-base-content;
+}
+
+/* Highlight.js */
+[data-color-scheme='light'] {
+  @include meta.load-css('highlight.js/styles/stackoverflow-light');
+}
+[data-color-scheme='dark'] {
+  @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+}
+[data-color-scheme='auto'] {
+  @media (prefers-color-scheme: light) {
+    @include meta.load-css('highlight.js/styles/stackoverflow-light');
+  }
+  @media (prefers-color-scheme: dark) {
+    @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+  }
+}
+.hljs {
+  background: transparent !important;
+  padding: 0.5em !important;
+}
diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js
index 789bf9cbbf1..6619a630d47 100644
--- a/examples/server/webui/vite.config.js
+++ b/examples/server/webui/vite.config.js
@@ -2,6 +2,9 @@
 import { viteSingleFile } from 'vite-plugin-singlefile';
 import path from 'path';
 import fs from 'fs';
+import zlib from 'zlib';
+
+const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
 
 const GUIDE_FOR_FRONTEND = `
 <!--
@@ -12,25 +15,45 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();
 
-export default {
-  plugins: [
-    viteSingleFile(),
-    (function llamaCppPlugin() {
-      let config;
-      return {
-        name: 'llamacpp:build',
-        apply: 'build',
-        async configResolved(_config) {
-          config = _config;
-        },
-        writeBundle() {
-          const outputIndexHtml = path.join(config.build.outDir, 'index.html');
-          const content = fs.readFileSync(outputIndexHtml, 'utf-8');
-
-          const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
-          fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
+const BUILD_PLUGINS = [
+  viteSingleFile(),
+  (function llamaCppPlugin() {
+    let config;
+    return {
+      name: 'llamacpp:build',
+      apply: 'build',
+      async configResolved(_config) {
+        config = _config;
+      },
+      writeBundle() {
+        const outputIndexHtml = path.join(config.build.outDir, 'index.html');
+        const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
+        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
+
+        // the gzip header contains machine-specific fields; zero them so the written file does not depend on the build machine
+        // timestamp (header bytes 4..7)
+        compressed[0x4] = 0;
+        compressed[0x5] = 0;
+        compressed[0x6] = 0;
+        compressed[0x7] = 0;
+        // OS (header byte 9)
+        compressed[0x9] = 0;
+
+        if (compressed.byteLength > MAX_BUNDLE_SIZE) { // the limit applies to the gzip-compressed size
+          throw new Error(
+            `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
+            `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
+          );
         }
+
+        const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
+        fs.writeFileSync(targetOutputFile, compressed);
       }
-    })(),
-  ],
+    }
+  })(),
+];
+
+/** @type {import('vite').UserConfig} */
+export default {
+  plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS, // with ANALYZE set, the single-file and gzip plugins are skipped
 };
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 12ad5425651..c97e227242c 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
     }
 
     if (show_token_count) {
-        printf("Total number of tokens: %ld\n", tokens.size());
+        printf("Total number of tokens: %zu\n", tokens.size());
     }
     // silence valgrind
     llama_free(ctx);
diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt
new file mode 100644
index 00000000000..c72bd814c3b
--- /dev/null
+++ b/examples/tts/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-tts)
+add_executable(${TARGET} tts.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17) # tts.cpp uses std::clamp, which requires C++17
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
new file mode 100644
index 00000000000..8909a65fd1e
--- /dev/null
+++ b/examples/tts/convert_pt_to_hf.py
@@ -0,0 +1,180 @@
+# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
+# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
+#
+# TODO: this script is LLM-generated and probably very inefficient and should be rewritten
+
+import torch
+import json
+import os
+import sys
+import re
+
+from safetensors.torch import save_file
+
+# default
+model_path = './model.pt';
+
+# read from CLI
+if len(sys.argv) > 1:
+    model_path = sys.argv[1]
+
+# get the directory of the input model
+path_dst = os.path.dirname(model_path)
+
+print(f"Loading model from {model_path}")
+
+model = torch.load(model_path, map_location='cpu')
+
+#print(model)
+
+# print all keys
+for key in model.keys():
+    print(key)
+    if key == 'hyper_parameters':
+        #print(model[key])
+        # dump as json pretty
+        print(json.dumps(model[key], indent=4))
+    #if key != 'state_dict' and key != 'optimizer_states':
+    #    print(model[key])
+
+# Check if the loaded model is a state_dict or a model instance
+if isinstance(model, torch.nn.Module):
+    state_dict = model.state_dict()
+else:
+    state_dict = model
+
+# Print the structure of the state_dict to understand its format
+print("State dictionary keys:")
+for key in state_dict.keys():
+    print(key)
+
+# Flatten the (possibly nested) checkpoint dict, keep only the tensors needed for inference and rename/reshape them
+def flatten_state_dict(state_dict, parent_key='', sep='.'):
+    items = []
+    items_new = []
+
+    for k, v in state_dict.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, torch.Tensor):
+            items.append((new_key, v))
+        elif isinstance(v, dict):
+            items.extend(flatten_state_dict(v, new_key, sep=sep).items())
+            return dict(items)  # NOTE(review): returns right after the first nested dict - later keys and the filtering below are skipped at this level; confirm before changing
+
+    size_total_mb = 0
+
+    for key, value in list(items):
+        # keep only what we need for inference
+        if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
+           not key.startswith('state_dict.backbone.') and \
+           not key.startswith('state_dict.head.out'):
+               print('Skipping key: ', key)
+               continue
+
+        new_key = key
+
+        new_key = new_key.replace('state_dict.', '')
+        new_key = new_key.replace('pos_net', 'posnet')
+
+        # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"
+        if new_key.startswith("backbone.posnet."):
+            match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key)
+            if match:
+               new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"
+
+        # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
+        if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
+            new_key = "backbone.embedding.weight"
+
+        # these are the only rows used
+        # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
+        if new_key.endswith("norm.scale.weight"):
+            new_key = new_key.replace("norm.scale.weight", "norm.weight")
+            value = value[0]
+
+        if new_key.endswith("norm.shift.weight"):
+            new_key = new_key.replace("norm.shift.weight", "norm.bias")
+            value = value[0]
+
+        if new_key.endswith("gamma"):
+            new_key = new_key.replace("gamma", "gamma.weight")
+
+        # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias
+        if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")):
+            value = value.unsqueeze(1)
+
+        if new_key.endswith("dwconv.bias"):
+            value = value.unsqueeze(1)
+
+        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
+        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")
+
+        size_total_mb += size_mb
+
+        #print(key, '->', new_key, ': ', value)
+        #print(key, '->', new_key)
+
+        items_new.append((new_key, value))
+
+    print(f"Total size: {size_total_mb:8.2f} MB")
+
+    return dict(items_new)
+
+flattened_state_dict = flatten_state_dict(state_dict)
+
+
+# Convert the model to the safetensors format
+output_path = path_dst + '/model.safetensors'
+save_file(flattened_state_dict, output_path)
+
+print(f"Model has been successfully converted and saved to {output_path}")
+
+# Calculate the total size of the .safetensors file
+total_size = os.path.getsize(output_path)
+
+# Create the weight map
+weight_map = {
+    "model.safetensors": ["*"]  # Assuming all weights are in one file
+}
+
+# Create metadata for the index.json file
+metadata = {
+    "total_size": total_size,
+    "weight_map": weight_map
+}
+
+# Save the metadata to index.json
+index_path = path_dst + '/index.json'
+with open(index_path, 'w') as f:
+    json.dump(metadata, f, indent=4)
+
+print(f"Metadata has been saved to {index_path}")
+
+config = {
+    "architectures": [
+        "WavTokenizerDec"
+    ],
+    "hidden_size": 1282,
+    "n_embd_features": 512,
+    "n_ff": 2304,
+    "vocab_size": 4096,
+    "n_head": 1,
+    "layer_norm_epsilon": 1e-6,
+    "group_norm_epsilon": 1e-6,
+    "group_norm_groups": 32,
+    "max_position_embeddings": 8192, # ?
+    "n_layer": 12,
+    "posnet": {
+        "n_embd": 768,
+        "n_layer": 6
+    },
+    "convnext": {
+        "n_embd": 768,
+        "n_layer": 12
+    },
+}
+
+with open(path_dst + '/config.json', 'w') as f:
+    json.dump(config, f, indent=4)
+
+print(f"Config has been saved to {path_dst + 'config.json'}")
diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py
new file mode 100644
index 00000000000..0f81192fca5
--- /dev/null
+++ b/examples/tts/tts-outetts.py
@@ -0,0 +1,175 @@
+import sys
+#import json
+#import struct
+import requests
+import re
+
+def process_text(text: str):  # lowercases text and returns its whitespace-separated words made only of a-z
+    text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed: numbers are replaced by themselves here and then removed by the [^a-z\s] filter below
+    text = re.sub(r'[-_/,\.\\]', ' ', text)  # these separators become spaces
+    text = re.sub(r'[^a-z\s]', '', text)  # drop everything except a-z and whitespace
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text.split()
+
+# usage:
+# python tts-outetts.py http://server-llm:port http://server-dec:port "text"
+
+if len(sys.argv) <= 3:
+    print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"")
+    exit(1)
+
+host_llm = sys.argv[1]
+host_dec = sys.argv[2]
+text = sys.argv[3]
+
+prefix = """<|im_start|>
+<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>"""
+
+words = process_text(text)  # list of lowercase a-z words
+words = "<|text_sep|>".join([i.strip() for i in words])
+words += "<|text_end|>\n"
+
+# voice data
+# TODO: load from json
+#suffix = """<|audio_start|>
+#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
+#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
+#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
+#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
+#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
+#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
+#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
+#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
+#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
+#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
+#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
+#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
+#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
+#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
+#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
+#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
+#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
+#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
+#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
+#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
+#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
+#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
+#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
+#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
+#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
+#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
+#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
+#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
+#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
+#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>"""
+
+# TODO: tokenization is slow for some reason - here is pre-tokenized input
+suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455,
+          155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413,
+          152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400,
+          152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163,
+          153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321,
+          153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751,
+          152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791,
+          151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325,
+          153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
+          152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198,
+          19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810,
+          152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946,
+          151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317,
+          152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325,
+          151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680,
+          152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
+          152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144,
+          153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720,
+          153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958,
+          152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 152162, 152186, 153071,
+          152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380,
+          153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808,
+          151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034,
+          152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718,
+          152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198,
+          19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733,
+          151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333,
+          151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801,
+          152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992,
+          153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428,
+          153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
+          152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122,
+          152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594,
+          153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673,
+          151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779,
+          151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241,
+          152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434,
+          153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223,
+          152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748,
+          152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988,
+          152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390,
+          152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558,
+          152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
+          151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341,
+          153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295,
+          152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385,
+          152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593,
+          152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938,
+          152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941,
+          152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
+          152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583,
+          152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442,
+          152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
+          151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198,
+          275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799,
+          151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257,
+          152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669,
+          152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467,
+          152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
+          152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759,
+          153318, 153165, 153349, 151670, ]
+
+response = requests.post(
+    host_llm + "/completion",
+    json={
+        "prompt": [prefix + words, *suffix],
+        "n_predict": 1024,
+        "cache_prompt": True,
+        "return_tokens": True,
+        "samplers": ["top_k"],
+        "top_k": 16,
+        "seed": 1003,
+    }
+)
+
+response_json = response.json()
+
+#print(json.dumps(response_json, indent=4))
+#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n"))
+#print(json.dumps(response_json["timings"], indent=4))
+#print(json.dumps(response_json["tokens"], indent=4))
+
+codes = response_json["tokens"]
+
+codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772]
+
+response = requests.post(
+    host_dec + "/embeddings",
+    json={
+        "input": [*codes],
+    }
+)
+
+response_json = response.json()
+
+#print(json.dumps(response_json, indent=4))
+
+# spectrogram
+embd = response_json[0]["embedding"]
+
+n_codes = len(embd)
+n_embd = len(embd[0])
+
+print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
+
+# post-process the spectrogram to convert to audio
+# TODO: see the tts.cpp:embd_to_audio() and implement it in Python
+print('converting to audio ...')
+print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python')
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
new file mode 100644
index 00000000000..7f36b80f0de
--- /dev/null
+++ b/examples/tts/tts.cpp
@@ -0,0 +1,932 @@
+#define _USE_MATH_DEFINES // For M_PI on MSVC; must come before any #include that may pull in <cmath>
+
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "log.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <string>
+#include <thread>
+#include <vector>
+
+//
+// Terminal utils
+//
+
+#define SQR(X)    ((X) * (X))
+#define UNCUBE(x) ((x) < 48 ? 0 : (x) < 115 ? 1 : ((x) - 35) / 40) // maps 0..255 to a cube index; fully parenthesized so it is safe inside any expression
+
+/**
+ * Quantizes 24-bit RGB to xterm256 code range [16,256): returns the color-cube code (16 + 36*ir + 6*ig + ib) or the gray-ramp code (232 + il), whichever is closer in squared RGB distance (ties go to the cube).
+ */
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5; // weighted channel average, rounded by the +.5 and the int conversion
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8; // il: gray index, ql: the gray level it stands for
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+// Returns the escape sequence "\033[38;5;<code>m" where <code> is rgb2xterm256(r, g, b).
+static std::string set_xterm256_foreground(int r, int g, int b) {
+    const int x = rgb2xterm256(r, g, b);
+    // built with std::to_string (<string>): the previous std::ostringstream needed <sstream>, which this file does not include directly
+    return "\033[38;5;" + std::to_string(x) + "m";
+}
+
+const std::vector<std::string> k_colors = { // foreground escape sequences; each RGB triple is quantized by rgb2xterm256()
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
+};
+
+static void print_usage(int, char ** argv) { // logs an example command line; the first (int) argument is unused
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello!\"\n", argv[0]);
+    LOG("\n");
+}
+
+struct wav_header { // written to the output file byte-for-byte by save_wav16()
+    char riff[4] = {'R', 'I', 'F', 'F'};
+    uint32_t chunk_size; // set by save_wav16(): 36 + data_size
+    char wave[4] = {'W', 'A', 'V', 'E'};
+    char fmt[4] = {'f', 'm', 't', ' '};
+    uint32_t fmt_chunk_size = 16;
+    uint16_t audio_format = 1; // PCM
+    uint16_t num_channels = 1; // Mono
+    uint32_t sample_rate; // set by save_wav16()
+    uint32_t byte_rate; // set by save_wav16(): sample_rate * num_channels * bytes per sample
+    uint16_t block_align; // set by save_wav16(): num_channels * bytes per sample
+    uint16_t bits_per_sample = 16;
+    char data[4] = {'d', 'a', 't', 'a'};
+    uint32_t data_size; // set by save_wav16(): number of samples * bytes per sample
+};
+
+// Writes data as a mono 16-bit PCM WAV file: each float sample is scaled by 32767 and clamped to [-32768, 32767].
+static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+    std::ofstream file(fname, std::ios::binary);
+    if (!file) {
+        LOG_ERR("%s: Failed to open file '%s' for writing\n", __func__, fname.c_str()); // LOG_* calls in this file pass their own '\n'
+        return;
+    }
+
+    wav_header header; // the remaining fields keep their in-class defaults
+    header.sample_rate = sample_rate;
+    header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
+    header.block_align = header.num_channels * (header.bits_per_sample / 8);
+    header.data_size = data.size() * (header.bits_per_sample / 8);
+    header.chunk_size = 36 + header.data_size;
+
+    file.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+    for (const auto & sample : data) {
+        int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
+        file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
+    }
+    file.close();
+}
+
+// Fills output[0..length) with a Hann window: 0.5 * (1 - cos(2*pi*i / denom)).
+static void fill_hann_window(int length, bool periodic, float * output) {
+    // periodic windows use denom = length, symmetric ones denom = length - 1
+    const int denom = periodic ? length : length - 1;
+    for (int i = 0; i < length; i++) {
+        // denom == 0 only for a symmetric window of length 1: emit 1 there instead of evaluating 0/0 (NaN)
+        output[i] = denom > 0 ? 0.5 * (1.0 - cosf((2.0 * M_PI * i) / denom)) : 1.0;
+    }
+}
+
+// very poor man's FFT: twiddle() returns cos and sin of 2*pi*k/N, the factor used by the naive transform in irfft()
+static void twiddle(float * real, float * imag, int k, int N) {
+    float angle = 2 * M_PI * k / N;
+    *real = cos(angle);
+    *imag = sin(angle);
+}
+
+// Naive inverse transform of a one-sided spectrum, O(n * (n/2 + 1)) twiddle evaluations.
+//   inp_cplx: n/2 + 1 bins stored as interleaved (re, im) floats
+//   out_real: receives n real samples
+// Only the real part of every output sample is produced; the imaginary accumulator that the
+// earlier version computed was never read, so it is no longer evaluated.
+static void irfft(int n, const float * inp_cplx, float * out_real) {
+    const int N = n / 2 + 1;
+
+    std::vector<float> real_input(N);
+    std::vector<float> imag_input(N);
+    for (int i = 0; i < N; ++i) {
+        real_input[i] = inp_cplx[2 * i];
+        imag_input[i] = inp_cplx[2 * i + 1];
+    }
+
+    for (int k = 0; k < n; ++k) {
+        float acc = 0.0f;
+        for (int m = 0; m < N; ++m) {
+            float twiddle_real;
+            float twiddle_imag;
+
+            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
+
+            // real part of (re + i*im) * (cos + i*sin)
+            acc += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
+        }
+
+        // NOTE(review): the scale is 1/N with N = n/2 + 1 (unchanged), not 1/n - confirm this is intended
+        out_real[k] = acc / N;
+    }
+}
+
+//
+//  y = torch.nn.functional.fold(
+//       data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+//  )[:, 0, 0, pad:-pad]
+//
+// data.shape =  torch.Size([1, 1280, 261])
+// output_size =  84480
+// win_length =  1280
+// hop_length =  320
+// pad =  480
+//
+static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
+    // Overlap-add: window w (n_win consecutive values of data) is summed into output starting at index w*n_hop - n_pad.
+    // On return output holds the first n_out - 2*n_pad accumulated samples.
+    const int64_t n_data = (int64_t) data.size();
+
+    // start from zeros even if the caller passes a non-empty vector
+    output.assign(n_out, 0.0f);
+
+    int64_t col_idx = 0;
+    // stop as soon as data is exhausted: the remaining windows could not add anything
+    for (int64_t w_col = 0; w_col < n_out && col_idx < n_data; ++w_col) {
+        const int64_t start = w_col * n_hop - n_pad;
+        const int64_t end   = start + n_win;
+
+        for (int64_t w_im = start; w_im < end; ++w_im) {
+            if (w_im >= 0 && w_im < n_out && col_idx < n_data) {
+                output[w_im] += data[col_idx];
+            }
+            col_idx++;
+        }
+    }
+    output.resize(n_out - 2 * n_pad);
+}
+
+// Turns n_codes rows of n_embd floats (first half of a row: log-magnitudes, second half: phases) into audio samples. TODO: not optimized at all
+static std::vector<float> embd_to_audio(
+        const float * embd,
+        const int n_codes,
+        const int n_embd,
+        const int n_thread) {
+    const int n_fft = 1280;
+    const int n_hop = 320;
+    const int n_win = 1280;
+    const int n_pad = (n_win - n_hop)/2;
+    const int n_out = (n_codes - 1)*n_hop + n_win;
+
+    std::vector<float> hann(n_fft);
+
+    fill_hann_window(hann.size(), true, hann.data());
+
+    int n_spec = n_embd*n_codes;
+
+    std::vector<float> E (n_spec);
+    std::vector<float> S (n_spec);
+    std::vector<float> ST(n_spec);
+
+    for (int l = 0; l < n_codes; ++l) { // E is embd transposed: E[k*n_codes + l] = embd[l*n_embd + k]
+        for (int k = 0; k < n_embd; ++k) {
+            E[k*n_codes + l] = embd[l*n_embd + k];
+        }
+    }
+
+    for (int k = 0; k < n_embd/2; ++k) { // S: (re, im) pairs computed from magnitude and phase
+        for (int l = 0; l < n_codes; ++l) {
+            float mag = E[(k           )*n_codes + l];
+            float phi = E[(k + n_embd/2)*n_codes + l];
+
+            mag = exp(mag);
+
+            if (mag > 1e2) { // cap the magnitude at 100
+                mag = 1e2;
+            }
+            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
+            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+        }
+    }
+
+    for (int l = 0; l < n_codes; ++l) { // ST: S regrouped per code as interleaved (re, im), the layout irfft() reads
+        for (int k = 0; k < n_embd/2; ++k) {
+            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
+            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
+        }
+    }
+
+    std::vector<float> res  (n_codes*n_fft);
+    std::vector<float> hann2(n_codes*n_fft);
+
+    std::vector<std::thread> workers(n_thread);
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i] = std::thread([&, i]() {
+            for (int l = i; l < n_codes; l += n_thread) { // thread i handles codes i, i + n_thread, ...
+                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
+                for (int j = 0; j < n_fft; ++j) {
+                    res  [l*n_fft + j] *= hann[j];
+                    hann2[l*n_fft + j]  = hann[j] * hann[j];
+                }
+            }
+        });
+    }
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i].join();
+    }
+
+    std::vector<float> audio;
+    std::vector<float> env;
+
+    fold(res,   n_out, n_win, n_hop, n_pad, audio);
+    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+
+    for (size_t i = 0; i < audio.size(); ++i) { // divide by the overlap-added squared window
+        audio[i] /= env[i];
+    }
+
+    return audio;
+}
+
+static const std::map<int, std::string> ones = { // English words for 0..19, looked up with .at() by the number helpers
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = { // keyed by the tens digit (2..9); 0 and 1 are covered by `ones`
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words; zero and negative values give an empty string
+static std::string convert_less_than_thousand(int num) {
+    const int hundreds = num / 100;
+    const int rest     = num >= 100 ? num % 100 : num;
+    std::string words;
+    if (num >= 100) {
+        // the trailing space is kept even when nothing follows (300 -> "three hundred ")
+        words = ones.at(hundreds) + " hundred ";
+    }
+    if (rest >= 20) {
+        words += tens.at(rest / 10);
+        const int unit = rest % 10;
+        if (unit > 0) {
+            words += "-" + ones.at(unit);
+        }
+    } else if (rest > 0) {
+        words += ones.at(rest);
+    }
+    return words;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    // Spells out "<digits>" or "<digits>.<digits>" in English; fractional digits are read one by one.
+    // Returns " " when the integer part cannot be parsed, e.g. when it does not fit in a long long.
+    try {
+        const size_t decimal_pos = number_str.find('.');
+        const std::string integer_part = number_str.substr(0, decimal_pos);
+        // 64-bit parse: with std::stoi every number above INT_MAX was replaced by " "
+        long long int_number = std::stoll(integer_part);
+        std::string result;
+
+        // group names, largest first; for a long long every group value is below 1000
+        static const struct { long long unit; const char * name; } scales[] = {
+            { 1000000000000000000LL, " quintillion " },
+            { 1000000000000000LL,    " quadrillion " },
+            { 1000000000000LL,       " trillion "    },
+            { 1000000000LL,          " billion "     },
+            { 1000000LL,             " million "     },
+            { 1000LL,                " thousand "    },
+        };
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            for (const auto & scale : scales) {
+                if (int_number >= scale.unit) {
+                    result += convert_less_than_thousand((int) (int_number / scale.unit)) + scale.name;
+                    int_number %= scale.unit;
+                }
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand((int) int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            const std::string decimal_part = number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception &) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    // matches runs of digits with an optional fractional part, e.g. "42" or "3.14"
+    const std::regex number_pattern(R"(\d+(\.\d+)?)");
+    const std::sregex_iterator matches_end;
+
+    std::string result;
+    size_t cursor = 0; // offset of the first input character that has not been copied yet
+    for (std::sregex_iterator m(input_text.begin(), input_text.end(), number_pattern); m != matches_end; ++m) {
+        const size_t match_begin = m->position();
+        result.append(input_text, cursor, match_begin - cursor); // text between numbers is copied verbatim
+        result.append(number_to_words(m->str()));
+        cursor = match_begin + m->length();
+    }
+    result.append(input_text, cursor); // whatever follows the last number
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text) {
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    // tolower() is undefined for negative arguments, so bytes >= 0x80 must be passed as unsigned char
+    std::transform(processed_text.begin(), processed_text.end(), processed_text.begin(),
+                   [](unsigned char c) { return (char) ::tolower(c); });
+    // separator characters become spaces
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+    // everything that is neither a-z nor whitespace is dropped
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+    // whitespace runs collapse to one space, then leading/trailing whitespace is trimmed
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token, equivalent to:
+
+        for (auto & c : prompt_user) {
+        if (c == ' ') {
+            prompt_clean += "<|text_sep|>";
+    */
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
+static void prompt_add(llama_tokens & prompt, llama_token token) {
+    prompt.emplace_back(token); // append a single token id
+}
+
+static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
+    for (const llama_token tok : tokens) { prompt.push_back(tok); } // append every id, order preserved
+}
+
+static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
+    const auto pieces = common_tokenize(model, txt, add_special, parse_special); // tokenize txt, then append the ids
+    prompt.insert(prompt.end(), pieces.begin(), pieces.end());
+}
+
+static void prompt_init(llama_tokens & prompt, const llama_model * model) {
+    const std::string header = "<|im_start|>\n";
+    prompt.clear(); // start over: afterwards the prompt holds nothing but the tokenized header
+    prompt_add(prompt, model, header, true, true); // add_special = true, parse_special = true
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.prompt = "";
+
+    params.n_predict = 4096;
+    params.n_batch   = 8192;
+    params.n_ctx     = 8192;
+
+    params.sampling.top_k = 4;
+    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
+        return 1;
+    }
+
+    const int n_parallel = params.n_parallel;
+    const int n_predict  = params.n_predict;
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model_ttc = NULL; // text-to-codes
+    llama_model * model_cts = NULL; // codes-to-speech
+
+    llama_context * ctx_ttc = NULL;
+    llama_context * ctx_cts = NULL;
+
+    common_init_result llama_init_ttc = common_init_from_params(params);
+    model_ttc = llama_init_ttc.model;
+    ctx_ttc = llama_init_ttc.context;
+    // TODO: refactor in a common struct
+    params.model     = params.vocoder.model;
+    params.model_url = params.vocoder.model_url;
+    params.hf_repo   = params.vocoder.hf_repo;
+    params.hf_file   = params.vocoder.hf_file;
+    params.embedding = true;
+    common_init_result llama_init_cts = common_init_from_params(params);
+    model_cts = llama_init_cts.model;
+    ctx_cts = llama_init_cts.context;
+    if (model_ttc == NULL || ctx_ttc == NULL || model_cts == NULL || ctx_cts == NULL) {
+        LOG_ERR("%s: failed to load the text-to-codes model or the vocoder\n", __func__);
+        return 1; // everything below dereferences these pointers
+    }
+    std::vector<common_sampler *> smpl(n_parallel);
+    for (int i = 0; i < n_parallel; ++i) {
+        params.sampling.no_perf = (i != 0);
+        params.sampling.seed = params.sampling.seed + 1;
+
+        smpl[i] = common_sampler_init(model_ttc, params.sampling);
+    }
+
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl[0]));
+    LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl[0]).c_str());
+
+    LOG_INF("%s: loading done\n", __func__);
+
+    const auto t_main_start = ggml_time_us();
+
+    std::vector<llama_token> codes;
+
+    // process prompt and generate voice codes
+    {
+        LOG_INF("%s: constructing prompt ..\n", __func__);
+
+        std::vector<llama_token> prompt_inp;
+
+        prompt_init(prompt_inp, model_ttc);
+
+        prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
+
+        // convert the input text into the necessary format expected by OuteTTS
+        {
+            std::string prompt_clean = process_text(params.prompt);
+
+            LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
+
+            prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
+        }
+
+        prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
+
+        // disabled to save time on tokenizing each time
+        // TODO: load voices from the json files
+#if 0
+        const std::string voice_data = R"(<|audio_start|>
+the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
+overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
+package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
+from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
+just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
+two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
+people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
+is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
+pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
+remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
+sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
+i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
+have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
+some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
+critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
+about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
+some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
+of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
+the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
+gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
+aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
+but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
+its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
+still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
+really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
+enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
+and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
+it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
+looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
+lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
+
+        auto tmp = common_tokenize(model_ttc, voice_data, false, true);
+        printf("\n\n");
+        for (int i = 0; i < tmp.size(); ++i) {
+            printf("%d, ", tmp[i]);
+        }
+        printf("\n\n");
+#else
+        prompt_add(prompt_inp, llama_tokens {
+            151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585,
+            152460, 153375, 151670, 198, 74455, 155808, 151669, 151799,
+            151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470,
+            151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040,
+            153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691,
+            153368, 153437, 151670, 198, 1722, 155828, 151669, 152607,
+            152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198,
+            152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207,
+            152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267,
+            153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179,
+            153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904,
+            152311, 151670, 198, 1499, 155791, 151669, 152276, 152454,
+            153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226,
+            153043, 152325, 153267, 152622, 151670, 198, 4250, 155797,
+            151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
+            152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213,
+            152112, 153204, 151722, 152542, 151670, 198, 19789, 155796,
+            151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002,
+            152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224,
+            152244, 153387, 153404, 151670, 198, 16069, 155811, 151669,
+            152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733,
+            152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589,
+            152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504,
+            153376, 152272, 152433, 152325, 151941, 151670, 198, 285,
+            155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381,
+            152474, 152680, 152157, 153255, 152324, 151682, 151670, 198,
+            32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
+            152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488,
+            153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685,
+            152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669,
+            151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459,
+            153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609,
+            152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470,
+            152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018,
+            152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736,
+            153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457,
+            152393, 153112, 152595, 151670, 198, 19098, 155808, 151669,
+            152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239,
+            153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673,
+            152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482,
+            152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795,
+            152111, 152746, 152377, 153471, 152309, 151670, 198, 19016,
+            155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701,
+            152939, 152536, 152091, 151815, 152733, 151672, 151670, 198,
+            14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042,
+            153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670,
+            198, 36996, 8303, 155832, 151669, 152231, 152256, 152835,
+            152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600,
+            151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847,
+            153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458,
+            152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250,
+            152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
+            152228, 152733, 151670, 198, 9096, 155801, 151669, 151698,
+            153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106,
+            152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851,
+            152901, 152885, 152594, 153446, 153080, 151670, 198, 14689,
+            155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191,
+            151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384,
+            153364, 153188, 153246, 151670, 198, 1055, 155779, 151669,
+            151869, 152388, 152711, 153334, 151736, 151670, 198, 1782,
+            155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046,
+            151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605,
+            153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133,
+            152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581,
+            152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032,
+            152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409,
+            151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469,
+            152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230,
+            151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435,
+            152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540,
+            151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715,
+            153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
+            151670, 198, 8088, 155792, 151669, 152452, 153497, 153353,
+            152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285,
+            153411, 152495, 153141, 152320, 151670, 198, 1199, 155781,
+            151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271,
+            151670, 198, 43366, 155799, 151669, 152308, 151682, 152889,
+            152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180,
+            151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287,
+            151677, 151670, 198, 53660, 155808, 151669, 151727, 152092,
+            152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384,
+            151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691,
+            152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676,
+            153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
+            152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234,
+            153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678,
+            152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825,
+            152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957,
+            151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
+            151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174,
+            151792, 153409, 153327, 152990, 151670, 198, 275, 155781,
+            151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974,
+            151670, 198, 94273, 155799, 151669, 152953, 152938, 153427,
+            152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331,
+            152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326,
+            152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268,
+            153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110,
+            152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444,
+            152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
+            152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499,
+            152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349,
+            151670,});
+#endif
+
+        // print the prompt token-by-token
+
+        LOG("\n");
+
+        for (auto id : prompt_inp) {
+            LOG("%s", common_token_to_piece(ctx_ttc, id).c_str());
+        }
+
+        LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size());
+
+        LOG("\n");
+
+        // create a llama_batch
+        // we use this object to submit token data for decoding
+        llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel);
+
+        std::vector<llama_seq_id> seq_ids(n_parallel, 0);
+        for (int32_t i = 0; i < n_parallel; ++i) {
+            seq_ids[i] = i;
+        }
+
+        // evaluate the initial prompt
+        for (size_t i = 0; i < prompt_inp.size(); ++i) {
+            common_batch_add(batch, prompt_inp[i], i, seq_ids, false);
+        }
+        GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
+
+        // llama_decode will output logits only for the last token of the prompt
+        batch.logits[batch.n_tokens - 1] = true;
+
+        if (llama_decode(ctx_ttc, batch) != 0) {
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+
+        if (n_parallel > 1) {
+            LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        }
+
+        llama_synchronize(ctx_ttc);
+
+        LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
+
+        const auto t_dec_start = ggml_time_us();
+
+        // main loop
+
+        // remember the batch index of the last token for each parallel sequence
+        // we need this to determine which logits to sample from
+        std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
+
+        int n_past   = batch.n_tokens;
+        int n_decode = 0;
+
+        while (n_decode <= n_predict) {
+            // prepare the next batch
+            common_batch_clear(batch);
+
+            // sample the next token for each parallel sequence / stream
+            for (int32_t i = 0; i < n_parallel; ++i) {
+                if (i_batch[i] < 0) {
+                    // the stream has already finished
+                    continue;
+                }
+
+                const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+
+                common_sampler_accept(smpl[i], new_token_id, true);
+
+                codes.push_back(new_token_id);
+
+                const auto * cands = common_sampler_get_candidates(smpl[i]);
+
+                // is it an end of generation? -> mark the stream as finished
+                if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) {
+                    std::string reason;
+                    if (llama_token_is_eog(model_ttc, new_token_id)) {
+                        reason = "eos";
+                    } else {
+                        reason = "n_predict";
+                    }
+
+                    i_batch[i] = -1;
+
+                    LOG("\n");
+                    if (n_parallel > 1) {
+                        LOG_CNT("\n");
+                        LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str());
+                    }
+
+                    continue;
+                }
+
+                {
+                    const float p = cands->data[cands->selected].p;
+
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size()))));
+
+                    LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m");
+                    //LOG_CNT("%d", i);
+                }
+
+                i_batch[i] = batch.n_tokens;
+
+                // push this new token for next evaluation
+                common_batch_add(batch, new_token_id, n_past, { i }, true);
+            }
+
+            // all streams are finished
+            if (batch.n_tokens == 0) {
+                break;
+            }
+
+            n_decode += 1;
+            n_past += 1;
+
+            // evaluate the current batch with the transformer model
+            if (llama_decode(ctx_ttc, batch)) {
+                LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+                return 1;
+            }
+        }
+
+        llama_batch_free(batch);
+
+        LOG("\n");
+        LOG_INF("%s: time for decoder:       %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f);
+    }
+
+    common_perf_print(ctx_ttc, smpl[0]);
+
+    //std::vector<llama_token> codes = {198, 88225, 155856, 151669, 152205,
+    //    153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695,
+    //    153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010,
+    //    153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286,
+    //    152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296,
+    //    153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690,
+    //    153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061,
+    //    153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670,
+    //    198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683,
+    //    152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908,
+    //    151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359,
+    //    153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424,
+    //    151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670,
+    //    198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729,
+    //    152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669,
+    //    153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670,
+    //    198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501,
+    //    152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242,
+    //    153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360,
+    //    153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055,
+    //    152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670,
+    //    198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441,
+    //    152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831,
+    //    153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133,
+    //    153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109,
+    //    152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055,
+    //    155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729,
+    //    151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337,
+    //    153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153,
+    //    153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365,
+    //    153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218,
+    //    152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464,
+    //    152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855,
+    //    152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418,
+    //    153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645};
+
+    {
+        const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
+
+        LOG("\n");
+        LOG_INF("codes: '%s'\n", inp_txt.c_str());
+        LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size());
+    }
+
+    // remove all non-audio tokens (i.e. < 151672 || > 155772)
+    codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
+
+    {
+        const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
+        LOG_INF("codes audio: '%s'\n", inp_txt.c_str());
+        LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size());
+    }
+
+    for (auto & token : codes) {
+        token -= 151672;
+    }
+
+    const auto t_voc_start = ggml_time_us();
+
+    const int n_codes = codes.size();
+
+    llama_batch batch = llama_batch_init(n_codes, 0, 1);
+
+    for (size_t i = 0; i < codes.size(); ++i) {
+        common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits?
+    }
+    GGML_ASSERT(batch.n_tokens == n_codes);
+
+    if (llama_decode(ctx_cts, batch) != 0) {
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    llama_synchronize(ctx_cts);
+
+    LOG_INF("%s: time for vocoder:      %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f);
+
+    const auto t_spec_start = ggml_time_us();
+
+#if 1
+    // spectral operations
+    const int n_embd = llama_n_embd(model_cts);
+    const float * embd = llama_get_embeddings(ctx_cts);
+
+    auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
+
+#else
+    // read the spectrogram from a file for debugging purposes
+    std::vector<float> audio;
+    {
+        std::ifstream fin("out.bin", std::ios::binary);
+        if (!fin) {
+            LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin");
+            return 1;
+        }
+
+        std::vector<float> embd;
+
+        int n_codes;
+        int n_embd;
+
+        fin.read(reinterpret_cast<char *>(&n_codes), sizeof(int));
+        fin.read(reinterpret_cast<char *>(&n_embd), sizeof(int));
+
+        embd.resize(n_codes * n_embd);
+        fin.read(reinterpret_cast<char *>(embd.data()), n_codes * n_embd * sizeof(float));
+        fin.close();
+
+        LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd);
+
+        audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads);
+    }
+#endif
+
+    const std::string fname = "output.wav";
+
+    const int n_sr = 24000; // sampling rate
+    const int n_mute = std::min(n_sr/4, (int) audio.size()); // a short clip can hold fewer than n_sr/4 samples
+    // zero out first 0.25 seconds (or the whole clip when it is shorter than that)
+    for (int i = 0; i < n_mute; ++i) {
+        audio[i] = 0.0f;
+    }
+
+    LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
+    LOG_INF("%s: total time:            %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
+
+    save_wav16(fname, audio, n_sr);
+
+    LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
+
+    llama_free(ctx_ttc);
+    llama_free_model(model_ttc);
+
+    llama_free(ctx_cts);
+    llama_free_model(model_cts);
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 3438710f06b..22799cd367c 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -32,6 +32,13 @@ else()
     endif()
 endif()
 
+# remove the lib prefix on win32 (CMake adds it by default for MinGW; the check covers every WIN32 toolchain)
+if (WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_MODULE_PREFIX  "")
+endif()
+
 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
 
@@ -67,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
 endif()
 
 # general
-option(GGML_STATIC "ggml: static link libraries"         OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag"     ${GGML_NATIVE_DEFAULT})
-option(GGML_LTO    "ggml: enable link time optimization" OFF)
-option(GGML_CCACHE "ggml: use ccache if available"       ON)
+option(GGML_STATIC "ggml: static link libraries"                     OFF)
+option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
+option(GGML_LTO    "ggml: enable link time optimization"             OFF)
+option(GGML_CCACHE "ggml: use ccache if available"                   ON)
 
 # debug
 option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
@@ -113,8 +120,9 @@ endif()
 option(GGML_LASX             "ggml: enable lasx"             ON)
 option(GGML_LSX              "ggml: enable lsx"              ON)
 option(GGML_RVV              "ggml: enable rvv"              ON)
-option(GGML_SVE              "ggml: enable SVE"              OFF)
+
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
+set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 
 
 if (WIN32)
@@ -172,6 +180,11 @@ set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
                                             "ggml: sycl device architecture")
 
+option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
+option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
+option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
+option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
+
 # extra artifacts
 option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
diff --git a/ggml/include/ggml-opencl.h b/ggml/include/ggml-opencl.h
new file mode 100644
index 00000000000..6b61771358f
--- /dev/null
+++ b/ggml/include/ggml-opencl.h
@@ -0,0 +1,26 @@
+#ifndef GGML_OPENCL_H
+#define GGML_OPENCL_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API (C linkage; every entry point is declared with GGML_BACKEND_API)
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void); // registered by ggml-backend-reg.cpp when GGML_USE_OPENCL is defined
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL_H
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 386d5a15d81..c714fc8c837 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -237,7 +237,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
-#define GGML_ROPE_TYPE_NEOX 2
+#define GGML_ROPE_TYPE_NEOX   2
+#define GGML_ROPE_TYPE_MROPE  8
+#define GGML_ROPE_TYPE_VISION 24 // 16 | 8: shares bit 8 with GGML_ROPE_TYPE_MROPE, so (mode & GGML_ROPE_TYPE_VISION) is also non-zero for MROPE
 
 #define GGUF_MAGIC "GGUF"
 
@@ -1443,6 +1445,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[4], // array parameter: adjusted to int *, the [4] is not enforced by the compiler
+            int                   mode,        // NOTE(review): presumably a GGML_ROPE_TYPE_* value (e.g. MROPE / VISION) - confirm
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
@@ -1546,17 +1564,6 @@ extern "C" {
         int                   d1, // dilation dimension 1
         bool                  is_2D);
 
-    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // convolution kernel
-            struct ggml_tensor  * b,  // data
-            int                  s0,  // stride dimension 0
-            int                  s1,  // stride dimension 1
-            int                  p0,  // padding dimension 0
-            int                  p1,  // padding dimension 1
-            int                  d0,  // dilation dimension 0
-            int                  d1); // dilation dimension 1
-
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
@@ -1574,6 +1581,23 @@ extern "C" {
             int                   s,  // stride
             int                   d); // dilation
 
+    // depthwise
+    // TODO: this is very likely wrong for some cases! - needs more testing
+    GGML_API struct ggml_tensor * ggml_conv_1d_dw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
+
+    GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( // same parameters as ggml_conv_1d_dw, minus the padding argument p0
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   d0); // dilation
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
@@ -1593,7 +1617,6 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1620,6 +1643,18 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // depthwise (this header previously declared it as ggml_conv_depthwise_2d, with the same parameter list)
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index d5864d5dad3..d18a5d31f2f 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -194,11 +194,6 @@ endif()
 
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-
-    if (BUILD_SHARED_LIBS)
-        # TODO: should not use this
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
 endif()
 
 # ggml
@@ -247,6 +242,7 @@ function(ggml_add_backend_library backend)
         # write the shared library to the output directory
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+        add_dependencies(ggml ${backend})
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
@@ -321,6 +317,7 @@ ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
+ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 2b2240be85d..8dc8226ac49 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -534,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
         hn->offset = offset;
-        return;
     }
 }
 
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 2e7340145ee..7ddd178b5f3 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -46,6 +46,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef GGML_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -62,6 +66,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::wstring_convert / std::codecvt_utf8_utf16 (clang only)
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) { // UTF-8 bytes -> wide string
+    using utf8_utf16_cvt = std::codecvt_utf8_utf16<wchar_t>;
+    return std::wstring_convert<utf8_utf16_cvt>().from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) { // wide string -> UTF-8 bytes
+    using utf8_utf16_cvt = std::codecvt_utf8_utf16<wchar_t>;
+    return std::wstring_convert<utf8_utf16_cvt>().to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -84,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -110,8 +129,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -146,6 +165,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_VULKAN
         register_backend(ggml_backend_vk_reg());
 #endif
+#ifdef GGML_USE_OPENCL
+        register_backend(ggml_backend_opencl_reg());
+#endif
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
@@ -195,11 +217,11 @@ struct ggml_backend_registry {
         devices.push_back(device);
     }
 
-    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -207,7 +229,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -215,7 +237,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -224,16 +246,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
 
@@ -369,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -394,13 +416,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -416,86 +442,93 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
+#endif
+}
+
+static std::wstring backend_filename_prefix() {
+#ifdef _WIN32
+    return L"ggml-";
+#else
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L".dll";
 #else
-    return "libggml-";
+    return L".so";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring path_separator() {
 #ifdef _WIN32
-    return ".dll";
+    return L"\\";
 #else
-    return ".so";
+    return L"/";
 #endif
 }
 
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
      // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
         if (!fs::exists(search_path)) {
             continue;
         }
-        for (const auto & entry : fs::directory_iterator(search_path)) {
+        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
+        for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -507,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void ggml_backend_load_all() {
@@ -538,6 +571,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
 }
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index fa04ab84f3f..d410c02445c 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
+
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
+
             return true;
         }
         case GGML_OP_UPSCALE: {
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 0e055670362..f0aecac1bd1 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -74,112 +74,96 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
     if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
         CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-        (NOT CMAKE_OSX_ARCHITECTURES      AND
-        NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
             CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
 
         message(STATUS "ARM detected")
 
-        if (MSVC)
-            list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
-            list(APPEND ARCH_DEFINITIONS __ARM_NEON)
-            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
-
-            set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-            string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
-
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-            if (GGML_COMPILER_SUPPORT_DOTPROD)
-                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
-
-                message(STATUS "ARM feature DOTPROD enabled")
-            endif ()
-
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-
-            if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
-
-                message(STATUS "ARM feature MATMUL_INT8 enabled")
-            endif ()
-
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-                message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
-            endif ()
+        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
+        else()
+            check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+            if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+            endif()
 
-            set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-        elseif (APPLE)
             if (GGML_NATIVE)
-                set(USER_PROVIDED_MARCH FALSE)
-                foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
-                    if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
-                        set(USER_PROVIDED_MARCH TRUE)
-                        break()
-                    endif()
-                endforeach()
-
-                if (NOT USER_PROVIDED_MARCH)
-                    set(MARCH_FLAGS "-march=armv8.2a")
-
-                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-                    if (GGML_COMPILER_SUPPORT_DOTPROD)
-                        set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
-                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
-
-                        message(STATUS "ARM feature DOTPROD enabled")
-                    endif ()
+                # -mcpu=native does not always enable all the features in some compilers,
+                # so we check for them manually and enable them if available
+
+                execute_process(
+                    COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
+                    INPUT_FILE "/dev/null"
+                    OUTPUT_QUIET
+                    ERROR_VARIABLE ARM_MCPU
+                    RESULT_VARIABLE ARM_MCPU_RESULT
+                )
+                if (NOT ARM_MCPU_RESULT)
+                    string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
+                endif()
+                if ("${ARM_MCPU_FLAG}" STREQUAL "")
+                    set(ARM_MCPU_FLAG -mcpu=native)
+                    message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
+                endif()
 
-                    set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+                include(CheckCXXSourceRuns)
 
+                function(check_arm_feature tag code)
                     set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                    set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
-
-                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-                    if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-                        set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
-                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
-
-                        message(STATUS "ARM feature MATMUL_INT8 enabled")
-                    endif ()
-
+                    set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
+                    check_cxx_source_runs(
+                        "${code}"
+                        GGML_MACHINE_SUPPORTS_${tag}
+                    )
+                    if (GGML_MACHINE_SUPPORTS_${tag})
+                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
+                    else()
+                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+                    endif()
                     set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+                endfunction()
 
-                    list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
-                endif ()
-            endif ()
-        else()
-            check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-            if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-            endif()
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-                # Raspberry Pi 1, Zero
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-            endif()
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-                if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                    # Android armeabi-v7a
-                    list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-                else()
-                    # Raspberry Pi 2
-                    list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+                check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(i8mm    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(sve     "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+
+                list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
+            else()
+                if (GGML_CPU_ARM_ARCH)
+                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
                 endif()
             endif()
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-                # Android arm64-v8a
-                # Raspberry Pi 3, 4, Zero 2 (32-bit)
-                list(APPEND ARCH_FLAGS -mno-unaligned-access)
+
+            # show enabled features
+            if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+                set(FEAT_INPUT_FILE "NUL")
+            else()
+                set(FEAT_INPUT_FILE "/dev/null")
             endif()
-            if (GGML_SVE)
-                list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+
+            execute_process(
+                COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
+                INPUT_FILE ${FEAT_INPUT_FILE}
+                OUTPUT_VARIABLE ARM_FEATURE
+                RESULT_VARIABLE ARM_FEATURE_RESULT
+            )
+            if (ARM_FEATURE_RESULT)
+                message(WARNING "Failed to get ARM features")
+            else()
+                foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
+                    string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
+                    if (NOT ${feature_pos} EQUAL -1)
+                        message(STATUS "ARM feature ${feature} enabled")
+                    endif()
+                endforeach()
             endif()
         endif()
     elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
             (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
+
+        message(STATUS "x86 detected")
+
         if (MSVC)
             # instruction set detection for MSVC only
             if (GGML_NATIVE)
@@ -339,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
 
     if (GGML_BACKEND_DL)
+        if (GGML_NATIVE)
+            # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
+            message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
+        endif()
+
         # The feature detection code is compiled as a separate target so that
         # it can be built without the architecture flags
         # Since multiple variants of the CPU backend may be included in the same
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
index b9074cb3aca..5ec5263ceb4 100644
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
+    void * data = ggml_aligned_malloc(size);
     if (data == NULL) {
         fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index a51d1a6c595..2d79b8b611d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -564,21 +564,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
 
@@ -647,72 +647,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                        vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index ea17d6077e7..b7fefb9ddfd 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -126,8 +126,7 @@ struct ggml_arm_arch_features_type {
 #endif
 #include <windows.h>
 
-
-#if !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 #define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
 
 typedef volatile LONG atomic_int;
@@ -455,21 +454,21 @@ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
-    }                                              \
-    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \
+#define GGML_F32x4_REDUCE(res, x)                       \
+{ /* folds x[0..GGML_F32_ARR) into x[0], 3 halvings */  \
+    int offset = GGML_F32_ARR >> 1; /* half of x[] */   \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1; /* second pass: a quarter of x[] */   \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1; /* third pass: an eighth of x[] */    \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    } /* x[0]: full sum if GGML_F32_ARR is 1,2,4,8 */   \
+    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -987,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -998,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -2396,7 +2395,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
     ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
-    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
 
@@ -7420,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7472,14 +7471,14 @@ UseGgmlGemm1:;
 
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      vec_dot_type,
                                      dst->type))
@@ -9134,6 +9133,64 @@ static void ggml_rope_cache_init(
     }
 }
 
+static void ggml_mrope_cache_init(
+     float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // Fills cache[i0], cache[i0 + 1] for every even i0 in [0, ne0) via rope_yarn; the second value of each
+    // pair is multiplied by sin_sign. The angle of a pair is one of four running thetas (t, h, w, e),
+    // selected by which of the four `sections` ranges the pair index falls into (modulo their total).
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta_t = theta_base_t;
+    float theta_h = theta_base_h;
+    float theta_w = theta_base_w;
+    float theta_e = theta_base_e;  // extra position id for vision encoder
+    int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+    int sec_w = sections[1] + sections[0];  // first pair index of the w range
+    int sec_e = sections[2] + sec_w;        // first pair index of the e range
+    GGML_ASSERT(sect_dims <= ne0);
+
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            // compute theta independently for each section of dims
+            // (i.e. reset the corresponding theta when `i0` crosses from one section into the next)
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            } else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            } else if (sector == sec_w) {
+                theta_w = theta_base_w;
+            } else if (sector == sec_e) {
+                theta_e = theta_base_e;
+            }
+        }
+
+        // pick the running theta of the range that contains this pair; t is the default
+        float theta = theta_t;
+        if (sector >= sections[0] && sector < sec_w) {
+            theta = theta_h;
+        } else if (sector >= sec_w && sector < sec_e) {
+            theta = theta_w;
+        } else if (sector >= sec_e) {
+            theta = theta_e;
+        }
+
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        // all four thetas advance on every pair, including the ones not used for this pair
+        theta_t *= theta_scale;
+        theta_w *= theta_scale;
+        theta_h *= theta_scale;
+        theta_e *= theta_scale;
+    }
+}
+
 static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst,
@@ -9144,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
     const struct ggml_tensor * src2 = dst->src[2];
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
 
     //const int n_past     = ((int32_t *) dst->op_params)[0];
     const int n_dims     = ((int32_t *) dst->op_params)[1];
@@ -9157,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -9189,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -9204,18 +9273,63 @@ static void ggml_compute_forward_rope_f32(
 
     const int32_t * pos = (const int32_t *) src1->data;
 
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
+    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
 
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                if (!is_neox) {
+                if (is_neox || is_mrope) {
+                    if (is_vision){
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims];
+
+                            dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims/2];
+
+                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        }
+                    }
+                } else {
                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                         const float cos_theta = cache[i0 + 0];
                         const float sin_theta = cache[i0 + 1];
@@ -9229,8 +9343,10 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[0] = x0*cos_theta - x1*sin_theta;
                         dst_data[1] = x0*sin_theta + x1*cos_theta;
                     }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                         const int64_t ic = i0/2;
 
                         const float cos_theta = cache[i0 + 0];
@@ -9240,19 +9356,20 @@ static void ggml_compute_forward_rope_f32(
                         float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
 
                         const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
+                        const float x1 = src[n_dims];
 
-                        dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                     }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+                } else {
+                    // fill the remaining channels with data from src tensor
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
@@ -9270,6 +9387,7 @@ static void ggml_compute_forward_rope_f16(
     const struct ggml_tensor * src2 = dst->src[2];
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
 
     //const int n_past     = ((int32_t *) dst->op_params)[0];
     const int n_dims     = ((int32_t *) dst->op_params)[1];
@@ -9282,6 +9400,8 @@ static void ggml_compute_forward_rope_f16(
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -9314,6 +9434,16 @@ static void ggml_compute_forward_rope_f16(
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -9331,16 +9461,61 @@ static void ggml_compute_forward_rope_f16(
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
 
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                if (!is_neox) {
+                if (is_neox || is_mrope) {
+                    if (is_vision) {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+                            dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+
+                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    }
+                } else {
                     for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                         const float cos_theta = cache[i0 + 0];
                         const float sin_theta = cache[i0 + 1];
@@ -9354,8 +9529,10 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
-                } else {
-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                         const int64_t ic = i0/2;
 
                         const float cos_theta = cache[i0 + 0];
@@ -9365,19 +9542,19 @@ static void ggml_compute_forward_rope_f16(
                         ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
 
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
 
-                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
-                }
-
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+                } else {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
@@ -12945,7 +13122,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
 #include "windows.h"
 
 // TODO: support > 64 CPUs
-bool ggml_thread_apply_affinity(bool * mask) {
+static bool ggml_thread_apply_affinity(bool * mask) {
     HANDLE    h = GetCurrentThread();
     uint64_t  bitmask = 0ULL;
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c390957afa8..f11399cc628 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -394,8 +394,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     switch (op->op) {
         case GGML_OP_CPY:
             return
+                op->type != GGML_TYPE_IQ3_XXS &&
+                op->type != GGML_TYPE_IQ3_S   &&
                 op->type != GGML_TYPE_IQ2_XXS &&
                 op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ2_S   &&
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
@@ -519,6 +522,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_sve()) {
             features.push_back({ "SVE", "1" });
         }
+        if (ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
         if (ggml_cpu_get_sve_cnt() > 0) {
             static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
             features.push_back({ "SVE_CNT", sve_cnt.c_str() });
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index da4146ec4f6..00f7f11704e 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -53,6 +53,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
+#include <atomic>
+
 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
 #else
@@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
     return _mm512_fmadd_ps(a, b, c);
 }
 #endif
+#if defined(__AVX512BF16__)
+template <>
+inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {  // BF16 variant: a and b are BF16 vectors, c is the float accumulator
+    return _mm512_dpbf16_ps(c, a, b);  // the intrinsic takes the accumulator as its first argument
+}
+template <>
+inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {  // 256-bit BF16 variant: a and b are BF16 vectors, c is the float accumulator
+    return _mm256_dpbf16_ps(c, a, b);  // the intrinsic takes the accumulator as its first argument
+}
+#endif
 #endif
 
 #if defined(__ARM_FEATURE_FMA)
@@ -204,6 +216,7 @@ template <> inline float32x4_t load(const float *p) {
     return vld1q_f32(p);
 }
 #if !defined(_MSC_VER)
+// FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <> inline float16x8_t load(const ggml_fp16_t *p) {
     return vld1q_f16((const float16_t *)p);
 }
@@ -225,6 +238,13 @@ template <> inline __m256 load(const float *p) {
 }
 #endif // __AVX__
 
+#if defined(__AVX2__) || defined(__AVX512F__)
+template <> inline __m256 load(const ggml_bf16_t *p) {  // unaligned load of 128 bits (8 x 16-bit values) widened to 8 floats
+    return _mm256_castsi256_ps(  // reinterpret the shifted integers as floats, no conversion
+        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));  // zero-extend each 16-bit value to 32 bits, then move it into the upper 16 bits
+}
+#endif // __AVX2__
+
 #if defined(__F16C__)
 template <> inline __m256 load(const ggml_fp16_t *p) {
     return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@@ -238,8 +258,27 @@ template <> inline __m512 load(const float *p) {
 template <> inline __m512 load(const ggml_fp16_t *p) {
     return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
 }
+template <> inline __m512 load(const ggml_bf16_t *p) {  // unaligned load of 256 bits (16 x 16-bit values) widened to 16 floats
+    return _mm512_castsi512_ps(  // reinterpret the shifted integers as floats, no conversion
+        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));  // zero-extend each 16-bit value to 32 bits, then move it into the upper 16 bits
+}
 #endif // __AVX512F__
 
+#if defined(__AVX512BF16__)
+template <> inline __m512bh load(const ggml_bf16_t *p) {  // loads 512 bits of BF16 data without converting it
+    return (__m512bh)_mm512_loadu_ps((const float *)p);  // the float load only serves as an unaligned 512-bit load; the result is cast to the BF16 vector type
+}
+template <> inline __m256bh load(const ggml_bf16_t *p) {  // loads 256 bits of BF16 data without converting it
+    return (__m256bh)_mm256_loadu_ps((const float *)p);  // the float load only serves as an unaligned 256-bit load; the result is cast to the BF16 vector type
+}
+template <> inline __m512bh load(const float *p) {  // reads 32 floats, p[0..31], and converts them to one BF16 vector
+    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));  // p + 16 is passed first; presumably this puts p[0..15] in the low lanes - confirm against the intrinsic documentation
+}
+template <> inline __m256bh load(const float *p) {  // reads 16 floats, p[0..15], and converts them to one 256-bit BF16 vector
+    return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));  // unaligned 512-bit float load followed by the float-to-BF16 conversion
+}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // CONSTANTS
 
@@ -251,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION
 
+template <int M>  // M: upper bound on the block size
+static inline int64_t BLOCK_SIZE(size_t m) {  // returns ceil(m / NB_BLOC_M), the largest block when m is split into NB_BLOC_M blocks
+    const int64_t NB_BLOC_M = (m + M - 1) / M;  // number of blocks, ceil(m / M); this is 0 for m == 0, and the modulo below would then divide by zero
+    return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;  // exact quotient when m divides evenly, otherwise rounded up
+}
+
+static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {  // start offset of block ib when the first ibN blocks hold bloc_size items and the later ones hold bloc_size - 1
+    return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);  // past ibN: all full-size blocks plus (ib - ibN) blocks that are one item shorter
+}
+
 template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
 class tinyBLAS {
   public:
-    tinyBLAS(int64_t k,
+    tinyBLAS(const ggml_compute_params * params, int64_t k,  // params replaces the former ith/nth arguments; the pointer is stored, not copied
              const TA *A, int64_t lda,
              const TB *B, int64_t ldb,
-             TC *C, int64_t ldc,
-             int ith, int nth)
-        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+             TC *C, int64_t ldc)
+        : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {  // only stores its arguments; no work is done here
     }
 
-    void matmul(int64_t m, int64_t n) {
-        mnpack(0, m, 0, n);
+    bool matmul(int64_t m, int64_t n) {  // returns true once a tiling has been run; false means nothing was computed
+        if (k % KN != 0)  // k must be a whole number of KN-element steps
+            return false;
+        // choose SIZE_N (the RN picked by mnpack) so that n is covered with tiles of width RN and RN-1 only
+#if VECTOR_REGISTERS == 32
+        if (m % 16 == 0 && (m/16 >= params->nth)) {  // BM = 4 only when there is at least one 16-row block per thread
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 4>(m, n, SIZE_N, 12);
+            return true;
+        }
+        if (m % 8 == 0 ) {  // BM = 2
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 2>(m, n, SIZE_N, 12);
+            return true;
+        }
+        if (m % 4 == 0) {  // BM = 1
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 1>(m, n, SIZE_N, 12);
+            return true;
+        }
+#else  // VECTOR_REGISTERS == 16
+        if (m % 16 == 0 && (m/16 >= params->nth)) {  // BM = 4 only when there is at least one 16-row block per thread
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 4>(m, n, SIZE_N, 24);
+            return true;
+        }
+        if (m % 8 == 0 ) {  // BM = 2
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 2>(m, n, SIZE_N, 24);
+            return true;
+        }
+        if (m % 4 == 0) {  // BM = 1
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 1>(m, n, SIZE_N, 24);
+            return true;
+        }
+#endif
+        return false;  // m is not a multiple of 4: no tiling applies
     }
 
   private:
-    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
-        int64_t mc, nc, mp, np;
-        switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
-#if VECTOR_REGISTERS == 32
-        case 0x55:
-            mc = 5;
-            nc = 5;
-            gemm<5, 5>(m0, m, n0, n);
-            break;
-        case 0x45:
-            mc = 4;
-            nc = 5;
-            gemm<4, 5>(m0, m, n0, n);
-            break;
-        case 0x54:
-            mc = 5;
-            nc = 4;
-            gemm<5, 4>(m0, m, n0, n);
-            break;
-        case 0x44:
-            mc = 4;
-            nc = 4;
-            gemm<4, 4>(m0, m, n0, n);
-            break;
-        case 0x53:
-            mc = 5;
-            nc = 3;
-            gemm<5, 3>(m0, m, n0, n);
-            break;
-        case 0x35:
-            mc = 3;
-            nc = 5;
-            gemm<3, 5>(m0, m, n0, n);
-            break;
-        case 0x43:
-            mc = 4;
-            nc = 3;
-            gemm<4, 3>(m0, m, n0, n);
-            break;
-#else
-        case 0x55:
-        case 0x54:
-        case 0x53:
-        case 0x45:
-        case 0x44:
-        case 0x43:
-            mc = 4;
-            nc = 3;
-            gemm<4, 3>(m0, m, n0, n);
-            break;
-        case 0x35:
-#endif
-        case 0x34:
-            mc = 3;
-            nc = 4;
-            gemm<3, 4>(m0, m, n0, n);
-            break;
-        case 0x52:
-            mc = 5;
-            nc = 2;
-            gemm<5, 2>(m0, m, n0, n);
-            break;
-        case 0x33:
-            mc = 3;
-            nc = 3;
-            gemm<3, 3>(m0, m, n0, n);
-            break;
-        case 0x25:
-            mc = 2;
-            nc = 5;
-            gemm<2, 5>(m0, m, n0, n);
-            break;
-        case 0x42:
-            mc = 4;
-            nc = 2;
-            gemm<4, 2>(m0, m, n0, n);
-            break;
-        case 0x24:
-            mc = 2;
-            nc = 4;
-            gemm<2, 4>(m0, m, n0, n);
-            break;
-        case 0x32:
-            mc = 3;
-            nc = 2;
-            gemm<3, 2>(m0, m, n0, n);
-            break;
-        case 0x23:
-            mc = 2;
-            nc = 3;
-            gemm<2, 3>(m0, m, n0, n);
-            break;
-        case 0x51:
-            mc = 5;
-            nc = 1;
-            gemm<5, 1>(m0, m, n0, n);
-            break;
-        case 0x41:
-            mc = 4;
-            nc = 1;
-            gemm<4, 1>(m0, m, n0, n);
-            break;
-        case 0x22:
-            mc = 2;
-            nc = 2;
-            gemm<2, 2>(m0, m, n0, n);
-            break;
-        case 0x15:
-            mc = 1;
-            nc = 5;
-            gemm<1, 5>(m0, m, n0, n);
-            break;
-        case 0x14:
-            mc = 1;
-            nc = 4;
-            gemm<1, 4>(m0, m, n0, n);
-            break;
-        case 0x31:
-            mc = 3;
-            nc = 1;
-            gemm<3, 1>(m0, m, n0, n);
-            break;
-        case 0x13:
-            mc = 1;
-            nc = 3;
-            gemm<1, 3>(m0, m, n0, n);
-            break;
-        case 0x21:
-            mc = 2;
-            nc = 1;
-            gemm<2, 1>(m0, m, n0, n);
-            break;
-        case 0x12:
-            mc = 1;
-            nc = 2;
-            gemm<1, 2>(m0, m, n0, n);
-            break;
-        case 0x11:
-            mc = 1;
-            nc = 1;
-            gemm<1, 1>(m0, m, n0, n);
-            break;
-        default:
-            return;
+    template <int RM, int RN, int BM>
+    inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {  // maps the runtime tile width SIZE_N onto the matching compile-time RN
+        if (SIZE_N == RN) {
+            return gemm<RM, RN, BM>(m, n, BN);
+        }
+        if constexpr (RN > 1) {
+            return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);  // no match yet: try the next smaller RN
+        } else {
+            GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+            GGML_ASSERT(false); // reached only when SIZE_N matched no RN down to 1; not expected to happen
         }
-        mp = m0 + (m - m0) / mc * mc;
-        np = n0 + (n - n0) / nc * nc;
-        mnpack(mp, m, n0, np);
-        mnpack(m0, m, np, n);
     }
 
     template <int RM, int RN>
-    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
-        int64_t ytiles = (m - m0) / RM;
-        int64_t xtiles = (n - n0) / RN;
-        int64_t tiles = xtiles * ytiles;
-        int64_t duty = (tiles + nth - 1) / nth;
-        int64_t start = duty * ith;
-        int64_t end = start + duty;
-        if (end > tiles)
-            end = tiles;
-        for (int64_t job = start; job < end; ++job) {
-            int64_t ii = m0 + job / xtiles * RM;
-            int64_t jj = n0 + job % xtiles * RN;
-            D Cv[RN][RM] = {};
-            for (int64_t l = 0; l < k; l += KN)
-                for (int64_t j = 0; j < RN; ++j)
-                    for (int64_t i = 0; i < RM; ++i)
-                        Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
-                                        load<V>(B + ldb * (jj + j) + l),
-                                        Cv[j][i]);
-            for (int64_t j = 0; j < RN; ++j)
-                for (int64_t i = 0; i < RM; ++i)
-                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+    inline void gemm_bloc(int64_t ii, int64_t jj) {  // computes one RM x RN tile: writes C[ldc * (jj + j) + (ii + i)] for i < RM, j < RN
+        D Cv[RN][RM] = {};  // one zero-initialised accumulator per output element
+        for (int64_t l = 0; l < k; l += KN) {  // walk the shared dimension KN elements at a time
+            // load the smaller operand set once per step and stream the other one, to guide the compiler's operation order
+            if constexpr (RM <= RN) {
+                V Av[RM];  // all RM rows of A for this step
+                for (int64_t i = 0; i < RM; ++i) {
+                    Av[i] = load<V>(A + lda * (ii + i) + l);
+                }
+                for (int64_t j = 0; j < RN; ++j) {
+                    V Bv = load<V>(B + ldb * (jj + j) + l);  // one row of B at a time
+                    for (int64_t i = 0; i < RM; ++i) {
+                        Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
+                    }
+                }
+            } else {
+                V Bv[RN];  // all RN rows of B for this step
+                for (int64_t j = 0; j < RN; ++j) {
+                    Bv[j] = load<V>(B + ldb * (jj + j) + l);
+                }
+                for (int64_t i = 0; i < RM; ++i) {
+                    V Av = load<V>(A + lda * (ii + i) + l);  // one row of A at a time
+                    for (int64_t j = 0; j < RN; ++j) {
+                        Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
+                    }
+                }
+            }
         }
+        for (int64_t j = 0; j < RN; ++j)  // store the tile
+            for (int64_t i = 0; i < RM; ++i)
+                C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);  // hsum is defined outside this hunk; presumably it reduces the accumulator vector to a scalar
     }
 
+    template <int RM, int RN, int BM>
+    NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {  // splits C into ytiles * NB_BN jobs that the threads claim one by one through current_chunk
+        static std::atomic<int64_t> current_chunk;  // NOTE(review): function-local static, so one counter per template instantiation shared by every caller; confirm two threadpools can never run this concurrently
+
+        GGML_ASSERT(m % (RM * BM) == 0);
+        const int64_t ytiles = m / (RM * BM);  // row blocks of RM * BM rows each
+        const int64_t xtiles = (n + RN -1) / RN;  // column tiles, ceil(n / RN)
+        const int64_t jj_RN = (xtiles - (xtiles * RN - n));  // column tiles of full width RN; the remaining ones have width RN - 1
+
+        // "round" bloc_size to "nearest" BN
+        const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;  // number of column blocks
+        const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;  // tiles per column block, ceil(xtiles / NB_BN)
+        const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));  // column blocks holding SIZE_BN tiles; the remaining ones hold SIZE_BN - 1
+        const int64_t nb_job = ytiles * NB_BN;
+
+        if (params->ith == 0) {
+            GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
+            // Every thread starts at ith, so the first unprocessed chunk is nth.  This saves a bit of coordination right at the start.
+            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+        }
+
+        ggml_barrier(params->threadpool);  // all threads wait here, after thread 0 has reset the counter and before any fetch below
+
+        int64_t job = params->ith;
+        while (job < nb_job) {
+            const int64_t ii = (job % ytiles) * RM * BM;  // first row of this job
+            const int64_t jb =  job / ytiles;  // column block of this job
+            const int64_t jr0 = BLOC_POS(jb  , jj_BN, SIZE_BN);  // first column tile of the block
+            const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);  // one past its last column tile
+
+            const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);  // first column of the block
+            const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);  // one past its last column
+            const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;  // end of the part covered by full-width RN tiles
+
+            for (int64_t bi = 0; bi < BM * RM; bi += RM) {  // the BM row tiles of this job
+                int64_t jj = jj0;
+                for (; jj < jj1; jj += RN) {
+                    gemm_bloc<RM, RN>(ii + bi, jj);
+                }
+                if constexpr (RN > 1) {
+                    for (; jj < jj2; jj += RN - 1) {  // remaining columns use tiles of width RN - 1
+                        gemm_bloc<RM, RN-1>(ii + bi, jj);
+                    }
+                }
+                GGML_ASSERT(jj == jj2);
+            }
+
+            // next step.
+            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);  // fetch_add returns the value before the increment: that is the job claimed here
+        }
+
+        ggml_barrier(params->threadpool);  // presumably keeps a later call from resetting current_chunk while threads are still claiming jobs - confirm
+        return;
+    }
+
+    const ggml_compute_params * params;
     const TA *const A;
     const TB *const B;
     TC *const C;
@@ -451,8 +461,6 @@ class tinyBLAS {
     const int64_t lda;
     const int64_t ldb;
     const int64_t ldc;
-    const int ith;
-    const int nth;
 };
 
 //////////////////////////////////////////////////////////////////////////////////////////
@@ -1656,8 +1664,9 @@ class tinyBLAS_PPC {
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
-bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
+bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                     const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
+                     int64_t ldc, int Atype, int Btype, int Ctype) {
 
     assert(m >= 0);
     assert(n >= 0);
@@ -1665,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     assert(lda >= k);
     assert(ldb >= k);
     assert(ldc >= m);
-    assert(nth > 0);
-    assert(ith < nth);
+    assert(params->nth > 0);
+    assert(params->ith < params->nth);
 
     // only enable sgemm for prompt processing
     if (n < 2)
@@ -1681,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
         if (Btype != GGML_TYPE_F32)
             return false;
 #if defined(__AVX512F__)
-        if (k % 16)
-            return false;
-        tinyBLAS<16, __m512, __m512, float, float, float> tb{
+        tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
             k, (const float *)A, lda,
             (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+            (float *)C, ldc};
+        return tb.matmul(m, n);
 #elif defined(__AVX__) || defined(__AVX2__)
-        if (k % 8)
-            return false;
-        tinyBLAS<8, __m256, __m256, float, float, float> tb{
+        tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
             k, (const float *)A, lda,
             (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+            (float *)C, ldc};
+        return tb.matmul(m, n);
 #elif defined(__ARM_NEON)
         if (n < 4)
             return false;
-        if (k % 4)
-            return false;
-        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
+        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
             k, (const float *)A, lda,
             (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+            (float *)C, ldc};
+        return tb.matmul(m, n);
 #elif defined(__MMA__)
         if (k % 8)
             return false;
@@ -1719,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const float *)A, lda,
             (const float *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #else
@@ -1727,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }
 
+    case GGML_TYPE_BF16: {
+#if defined(__AVX512BF16__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__AVX512F__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__AVX2__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#endif
+        return false;
+    }
     case GGML_TYPE_F16: {
 #if defined(__AVX512F__)
-        if (k % 16)
-            return false;
-        if (Btype != GGML_TYPE_F32)
-            return false;
-        tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
-            k, (const ggml_fp16_t *)A, lda,
-            (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
+                (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
-        if (k % 8)
-            return false;
-        if (Btype != GGML_TYPE_F32)
-            return false;
-        tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
-            k, (const ggml_fp16_t *)A, lda,
-            (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
+                (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
         if (n < 8)
             return false;
-        if (k % 8)
-            return false;
-        if (Btype != GGML_TYPE_F16)
-            return false;
-        tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
-            k, (const ggml_fp16_t *)A, lda,
-            (const ggml_fp16_t *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #elif defined(__ARM_NEON) && !defined(_MSC_VER)
-        if (k % 4)
-            return false;
-        if (Btype != GGML_TYPE_F32)
-            return false;
-        tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
-            k, (const ggml_fp16_t *)A, lda,
-            (const float *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
-        return true;
-#else
-        return false;
+        if (Btype == GGML_TYPE_F32) {
+            tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const float *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #endif
+        return false;
     }
 
     case GGML_TYPE_Q8_0: {
@@ -1791,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_q8_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
@@ -1799,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_q8_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #else
@@ -1815,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_q4_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
@@ -1823,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_q4_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #else
@@ -1839,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_q5_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #else
@@ -1855,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             k, (const block_iq4_nl *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
-            ith, nth};
+            params->ith, params->nth};
         tb.matmul(m, n);
         return true;
 #else
@@ -1867,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
         return false;
     }
 
+    (void)params;
     (void)m;
     (void)n;
     (void)k;
@@ -1876,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     (void)ldb;
     (void)C;
     (void)ldc;
-    (void)ith;
-    (void)nth;
     (void)Atype;
     (void)Btype;
     (void)Ctype;
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
index caf6dd5567b..3d290951524 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -5,8 +5,8 @@
 extern "C" {
 #endif
 
-bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
-                     const void *, int64_t, void *, int64_t, int, int,
+bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
+                     const void *, int64_t, const void *, int64_t, void *, int64_t,
                      int, int, int);
 
 #ifdef __cplusplus
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index dac10ec36b0..2f42b8a9538 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
 }
 
 // non-contiguous kernel (slow)
-static __global__ void concat_f32_non_cont(
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+    concat_f32_non_cont(
         const char * src0,
         const char * src1,
               char * dst,
@@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
           uint64_t   nb0,
           uint64_t   nb1,
           uint64_t   nb2,
-          uint64_t   nb3,
-          int32_t   dim) {
+          uint64_t   nb3){
+    static_assert(dim >= 0 && dim <= 3);
+
     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
     const int64_t i1 = blockIdx.x;
 
-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
-
     const float * x;
 
-    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
         if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
             x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
         } else {
-            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+            if constexpr (dim == 0) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+            } else if constexpr (dim == 1) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+            } else if constexpr (dim == 2) {
+                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+            } else if constexpr (dim == 3) {
+                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+            }
         }
 
         float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     } else {
         dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
-        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
-                (const char *)src0->data,
-                (const char *)src1->data,
-                (      char *)dst->data,
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                 src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
-                dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
-                dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3], dim);
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
     }
 }
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 88f586d689c..2c84778d29c 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -4,6 +4,11 @@ struct rope_corr_dims {
     float v[2];
 };
 
+
+struct mrope_sections {  // wrapper that lets the four section sizes be passed to a kernel by value
+    int v[4];  // section sizes; rope_multi sums them and compares partial sums against the sector index
+};
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));
@@ -108,6 +113,105 @@ static __global__ void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
+template<typename T, bool has_ff> // has_ff selects whether freq_factors is read
+static __global__ void rope_multi(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); // even element index; one thread per pair
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) { // outside the rotated range: copy both elements through unchanged
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ne0 + i0/2; // the rotated pair is (i, i + n_dims/2)
+    const int i2 = row/p_delta_rows; // index of the position entry used by this row
+
+    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    int sec_w = sections.v[1] + sections.v[0]; // end of the second section
+    int sector = (i0 / 2) % sect_dims; // pair index wrapped into the combined section range
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) { // section 0 reads pos[i2]
+        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) { // section 1 reads pos at offset ne2
+        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) { // section 2 reads pos at offset 2*ne2
+        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) { // section 3 reads pos at offset 3*ne2
+        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; // freq_factors is only dereferenced when has_ff is true
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff> // has_ff selects whether freq_factors is read
+static __global__ void rope_vision(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); // even element index; one thread per pair
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i  = row*ne0 + i0/2; // the rotated pair is (i, i + n_dims)
+    const int i2 = row/p_delta_rows; // i2-th tokens
+
+    int sect_dims = sections.v[0] + sections.v[1]; // only the first two sections are used here
+    int sec_w = sections.v[1] + sections.v[0]; // same value as sect_dims
+    int sector = (i0 / 2) % sect_dims; // pair index wrapped into the two-section range
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) { // section 0 reads pos[i2]
+        const int p = sector;
+        theta_base = pos[i2]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) { // section 1 reads pos at offset ne2
+        const int p = sector - sections.v[0]; // exponent restarts at 0 inside the second section
+        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; // freq_factors is only dereferenced when has_ff is true
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims];
+
+    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
 template<typename T>
 static void rope_norm_cuda(
     const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@@ -156,6 +260,56 @@ static void rope_neox_cuda(
     }
 }
 
+template<typename T> // launches rope_multi<T, has_ff>; has_ff is true exactly when freq_factors is non-null
+static void rope_multi_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); // ceil division: each thread covers 2 elements
+    const dim3 block_nums(nr, n_blocks_x, 1); // grid.x = nr; n_blocks_x is used as grid.y despite its name
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, freq_factors, sections
+                );
+    } else {
+        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, freq_factors, sections
+                );
+    }
+}
+
+template<typename T> // launches rope_vision<T, has_ff>; has_ff is true exactly when freq_factors is non-null
+static void rope_vision_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); // ceil division: each thread covers 2 elements
+    const dim3 block_nums(nr, n_blocks_x, 1); // grid.x = nr; n_blocks_x is used as grid.y despite its name
+    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, freq_factors, sections
+                );
+    } else {
+        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
+                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+                theta_scale, freq_factors, sections
+                );
+    }
+}
+
 static void rope_norm_cuda_f16(
     const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
     rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }
 
+static void rope_multi_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    // thin wrapper: forwards every argument unchanged to the half instantiation of rope_multi_cuda
+    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_multi_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    // thin wrapper: forwards every argument unchanged to the float instantiation of rope_multi_cuda
+    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    // thin wrapper: forwards every argument unchanged to the half instantiation of rope_vision_cuda
+    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    // thin wrapper: forwards every argument unchanged to the float instantiation of rope_vision_cuda
+    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num tokens
     const int64_t nr = ggml_nrows(src0);
 
     //const int n_past     = ((int32_t *) dst->op_params)[0];
@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode       = ((int32_t *) dst->op_params)[2];
     //const int n_ctx      = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;
 
     // RoPE alteration for extended context
     float freq_base;
@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
 
     const int32_t * pos = (const int32_t *) src1_d;
 
@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         } else {
             GGML_ABORT("fatal error");
         }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
     } else {
         if (src0->type == GGML_TYPE_F32) {
             rope_norm_cuda_f32(
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 00a1546a7d5..549772c57c9 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
 //
 
 GGML_ATTRIBUTE_FORMAT(2, 3)
-void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
-void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+GGML_API void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
 
 #define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
 // Memory allocation
 
-void * ggml_aligned_malloc(size_t size);
-void ggml_aligned_free(void * ptr, size_t size);
+GGML_API void * ggml_aligned_malloc(size_t size);
+GGML_API void ggml_aligned_free(void * ptr, size_t size);
 
 // FP16 to FP32 conversion
 
@@ -551,6 +551,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+// expose GGUF internals for test code
+
+GGML_API size_t gguf_type_size(enum gguf_type type);
+
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+GGML_API struct gguf_buf gguf_buf_init(size_t size);
+GGML_API void gguf_buf_free(struct gguf_buf buf);
+
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 28ceecfc40d..50579227183 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 34fe5778e4f..28f590f9216 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_ARGMAX:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
@@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_ROPE:
             {
-                GGML_ASSERT(ne10 == ne02);
+                // make sure we have one or more position id(ne10) per token(ne02)
+                GGML_ASSERT(ne10 % ne02 == 0);
+                GGML_ASSERT(ne10 >= ne02);
 
                 const int nth = MIN(1024, ne00);
 
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
new file mode 100644
index 00000000000..45328a65793
--- /dev/null
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -0,0 +1,147 @@
+find_package(OpenCL REQUIRED)
+find_package(Python3 REQUIRED)
+
+set(TARGET_NAME ggml-opencl)
+
+ggml_add_backend_library(${TARGET_NAME}
+                         ggml-opencl.cpp
+                         ../../include/ggml-opencl.h)
+target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
+target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
+
+if (GGML_OPENCL_PROFILING)
+    message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
+    add_compile_definitions(GGML_OPENCL_PROFILING)
+endif ()
+
+add_compile_definitions(GGML_OPENCL_SOA_Q)
+
+if (GGML_OPENCL_USE_ADRENO_KERNELS)
+    message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
+    add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
+endif ()
+
+if (GGML_OPENCL_EMBED_KERNELS)
+    add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
+
+    set(OPENCL_CL_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
+    set(OPENCL_MM_CL_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
+    set(OPENCL_CVT_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
+
+    set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED             "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
+    set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
+    set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED          "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
+    set(OPENCL_TRANSPOSE_16_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
+    set(OPENCL_TRANSPOSE_32_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
+    set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED            "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
+
+    set(EMBED_KERNEL_SCRIPT             "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
+    file(MAKE_DIRECTORY                 "${CMAKE_BINARY_DIR}/autogenerated")
+
+    include_directories("${CMAKE_BINARY_DIR}/autogenerated")
+
+    # Python must be accessible from command line
+    add_custom_command(
+        OUTPUT ${OPENCL_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
+            ${OPENCL_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
+            ${OPENCL_MM_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_mm.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
+            ${OPENCL_CVT_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_cvt.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
+            ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
+            ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
+            ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h" # matches the file name in OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
+            ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_16.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
+            ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_32.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
+            ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
+    )
+
+    target_sources(${TARGET_NAME} PRIVATE
+                   ${OPENCL_CL_SOURCE_EMBED}
+                   ${OPENCL_MM_CL_SOURCE_EMBED}
+                   ${OPENCL_CVT_CL_SOURCE_EMBED}
+                   ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+                   ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+                   ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
+else ()
+    # copy ggml-opencl.cl to bin directory
+    configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
+
+    configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
+endif ()
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
new file mode 100644
index 00000000000..c77d629f087
--- /dev/null
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -0,0 +1,4004 @@
+#define CL_TARGET_OPENCL_VERSION 220
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+
+// suppress warnings in CL headers for GCC and Clang
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#ifdef __clang__
+#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
+#endif
+
+#include "ggml-opencl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml.h"
+
+#include <CL/cl.h>
+
+#include <string.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <atomic>
+#include <fstream>
+#include <limits>
+#include <vector>
+#include <string>
+#include <cmath>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define UNUSED(x) (void)(x)
+
+#define CL_CHECK(err)                                               \
+    do {                                                            \
+        cl_int err_ = (err);                                        \
+        if (err_ != CL_SUCCESS) {                                   \
+            GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            GGML_ASSERT(0);                                         \
+        }                                                           \
+    } while (0)
+
+//------------------------------------------------------------------------------
+// OpenCL
+//------------------------------------------------------------------------------
+
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
+
+enum GPU_FAMILY {
+    ADRENO,
+    INTEL,
+    UNKNOWN,
+};
+
+enum ADRENO_GPU_GEN {
+    ADRENO_UNKNOWN,
+    A7X,
+    A8X,
+    X1E,
+};
+
+// Classifies an Adreno device by looking for known model tags inside device_name.
+static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
+    // checked in order; the first tag found in device_name decides the result
+    static const struct { const char * tag; ADRENO_GPU_GEN gen; } known_gens[] = {
+        { "730", ADRENO_GPU_GEN::A7X },
+        { "740", ADRENO_GPU_GEN::A7X },
+        { "750", ADRENO_GPU_GEN::A7X },
+        { "830", ADRENO_GPU_GEN::A8X },
+        { "X1",  ADRENO_GPU_GEN::X1E },
+    };
+    for (const auto & entry : known_gens) {
+        if (strstr(device_name, entry.tag)) {
+            return entry.gen;
+        }
+    }
+    return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
+}
+
+// Extracts the two-character compiler major version that follows the "E031" or "DX" marker in driver_version.
+// Returns -1 when neither marker is present or the string ends before the version field starts.
+static int get_adreno_cl_compiler_version(const char *driver_version) {
+    const std::string driver_ver_str(driver_version);
+    size_t compiler_ver_offset = 5; // version field starts 5 chars after the start of "E031"
+    size_t compiler_ver_pos = driver_ver_str.find("E031");
+    if (compiler_ver_pos == std::string::npos) {
+        compiler_ver_offset = 3; // version field starts 3 chars after the start of "DX"
+        compiler_ver_pos = driver_ver_str.find("DX");
+        if (compiler_ver_pos == std::string::npos) {
+            return -1;
+        }
+    }
+    // substr() throws std::out_of_range if its start index is past the end, so reject truncated strings first
+    if (driver_ver_str.size() - compiler_ver_pos < compiler_ver_offset) {
+        return -1;
+    }
+    return std::atoi(driver_ver_str.substr(compiler_ver_pos + compiler_ver_offset, 2).c_str());
+}
+
+// backend device context
+struct ggml_backend_opencl_device_context {
+    cl_platform_id platform;
+    std::string platform_name;
+
+    cl_device_id device;
+    std::string device_name;
+};
+
+// backend context
+struct ggml_backend_opencl_context {
+    cl_device_id device;
+    std::string device_name;
+
+    std::string driver_version;
+
+    GPU_FAMILY gpu_family;
+    ADRENO_GPU_GEN adreno_gen;
+
+    cl_int alignment;
+    size_t max_alloc_size;
+    bool fp16_support;
+
+    int adreno_wave_size;
+
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_program program;
+    cl_program program_1;
+    cl_program program_2;
+
+    cl_kernel kernel_add, kernel_add_row;
+    cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_scale;
+    cl_kernel kernel_silu, kernel_silu_4;
+    cl_kernel kernel_gelu, kernel_gelu_4;
+    cl_kernel kernel_relu;
+    cl_kernel kernel_clamp;
+    cl_kernel kernel_norm;
+    cl_kernel kernel_rms_norm;
+    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
+    cl_kernel kernel_soft_max, kernel_soft_max_4;
+    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
+    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
+    cl_kernel kernel_mul_mat_f32_f32;
+    cl_kernel kernel_mul_mat_f16_f16;
+    cl_kernel kernel_mul_mat_f16_f32_1row;
+    cl_kernel kernel_mul_mat_f16_f32;
+    cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
+    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
+    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
+    cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
+              kernel_mul_mat_q4_0_f32_flat_img_v0;
+    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    cl_kernel kernel_mul_mv_q6_K_f32;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    // Transpose kernels
+    cl_program program_transpose_32;
+    cl_program program_transpose_32_16;
+    cl_program program_transpose_16;
+    cl_kernel kernel_transpose_32;
+    cl_kernel kernel_transpose_32_16;
+    cl_kernel kernel_transpose_16;
+
+    cl_mem A_s_d_max;            // max scale buffer size for transpose
+    cl_mem A_q_d_max;            // max weight buffer size for transpose
+    cl_mem B_d_max;              // max activation buffer size for transpose
+
+    // Gemm and Gemv related programs, kernels, etc
+    cl_program program_CL_gemm;
+    cl_program program_CL_gemv_general;
+    cl_program program_CL_gemv_4096_1_11008;
+    cl_program program_CL_gemv_4096_1_4096;
+    cl_program program_CL_gemv_11008_1_4096;
+    cl_program program_CL_gemv_32000_1_4096;
+    cl_kernel CL_mul_mat_Ab_Bi_8x4;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+};
+
+static ggml_backend_device                 g_ggml_backend_opencl_device;
+static ggml_backend_opencl_device_context  g_ggml_ctx_dev_main {
+    /*.platform         =*/ nullptr,
+    /*.platform_name    =*/ "",
+    /*.device           =*/ nullptr,
+    /*.device_name      =*/ "",
+};
+
+static int ggml_backend_opencl_n_devices = 0;
+
+// Profiling
+#ifdef GGML_OPENCL_PROFILING
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+    // Kernel execution time in nanoseconds.
+    cl_ulong duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
+std::vector<ProfilingInfo> g_profiling_info;
+#endif
+
+inline std::string read_file(const std::string &path) { // whole file as a string, "" if it cannot be opened
+  std::ifstream ifs(path);
+  if (!ifs) {
+    return "";
+  }
+  ifs.seekg(0, std::ios::end);
+  const std::streamoff len = ifs.tellg(); // -1 if the stream position cannot be determined
+  std::string text(len > 0 ? (size_t) len : 0, '\0'); // never size the buffer from a negative length
+  ifs.seekg(0, std::ios::beg);
+  ifs.read(&text[0], text.size());
+  return text;
+}
+
+// Creates an OpenCL program from program_buffer and builds it with compile_opts.
+// On failure logs the error (including the build log queried for dev) and exits the process.
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
+    cl_program p;
+    size_t program_size;
+    size_t log_size = 0;
+    int err;
+
+    program_size = strlen(program_buffer);
+
+    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if(err < 0) {
+        GGML_LOG_ERROR("ggml_opencl: error %d creating program\n", err);
+        exit(1);
+    }
+
+    err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
+    if(err < 0) {
+        // query the log size first, then fetch the log into a zero-filled buffer with room for a terminator
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+        std::vector<char> program_log(log_size + 1, '\0');
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, program_log.size(), program_log.data(), NULL);
+        GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log.data());
+        exit(1);
+    }
+
+    return p;
+}
+
+// Creates the OpenCL backend context on the first call and returns it; every
+// later call returns the same pointer without doing any work.
+//
+// Steps performed on the first call:
+//   1. enumerate OpenCL platforms and their devices;
+//   2. pick a device, honoring the environment variables GGML_OPENCL_PLATFORM
+//      (platform index, or substring of the platform name/vendor) and
+//      GGML_OPENCL_DEVICE (device index, or substring of the device name);
+//      without them the first GPU found is used, else the first device;
+//   3. verify the device (Adreno or Intel, OpenCL 2.x/3.x, FP16, subgroups);
+//   4. create the context and command queue, build all programs and kernels.
+//
+// `dev->context` must be a ggml_backend_opencl_device_context whose platform
+// and device handles are still null; they are filled in here.
+//
+// NOTE: several failure paths log an error and return the partially
+// initialized context (not nullptr); others terminate with exit(1).
+// ggml_backend_opencl_n_devices is only set to 1 when everything succeeded.
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+    static bool initialized = false;
+    static ggml_backend_opencl_context *backend_ctx = nullptr;
+
+    if (initialized) {
+        return backend_ctx;
+    }
+
+    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
+    GGML_ASSERT(dev_ctx);
+    GGML_ASSERT(dev_ctx->platform == nullptr);
+    GGML_ASSERT(dev_ctx->device == nullptr);
+    GGML_ASSERT(backend_ctx == nullptr);
+
+    initialized = true;
+    backend_ctx = new ggml_backend_opencl_context();
+    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+
+    cl_int err;
+
+    // Same macro that enables profiling everywhere else in this file.
+#ifdef GGML_OPENCL_PROFILING
+    GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
+#endif
+
+    struct cl_device;
+    struct cl_platform {
+        cl_platform_id id;
+        unsigned number;
+        char name[128];
+        char vendor[128];
+        struct cl_device * devices;
+        unsigned n_devices;
+        struct cl_device * default_device;
+    };
+
+    struct cl_device {
+        struct cl_platform * platform;
+        cl_device_id id;
+        unsigned number;
+        cl_device_type type;
+        char name[128];
+    };
+
+    enum { NPLAT = 16, NDEV = 16 };
+
+    struct cl_platform platforms[NPLAT];
+    unsigned n_platforms = 0;
+    struct cl_device devices[NDEV];
+    unsigned n_devices = 0;
+    struct cl_device * default_device = NULL;
+
+    cl_platform_id platform_ids[NPLAT];
+    if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
+        GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
+        return backend_ctx;
+    }
+    // clGetPlatformIDs reports the number of platforms available, which can be
+    // larger than the number of entries written to platform_ids.
+    if (n_platforms > (unsigned)NPLAT) {
+        n_platforms = NPLAT;
+    }
+
+    for (unsigned i = 0; i < n_platforms; i++) {
+        struct cl_platform * p = &platforms[i];
+        p->number = i;
+        p->id = platform_ids[i];
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+        cl_device_id device_ids[NDEV];
+        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+            p->n_devices = 0;
+        } else {
+            CL_CHECK(clGetDeviceIDsError);
+        }
+        // clGetDeviceIDs reports the number of matching devices available, which
+        // can exceed NDEV. All platforms also share the devices[] array, so only
+        // keep as many devices as still fit into it.
+        const unsigned n_devices_free = (unsigned)NDEV - n_devices;
+        if (p->n_devices > n_devices_free) {
+            p->n_devices = n_devices_free;
+        }
+        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+        p->default_device = NULL;
+
+        for (unsigned j = 0; j < p->n_devices; j++) {
+            struct cl_device * d = &devices[n_devices];
+            d->number = n_devices++;
+            d->id = device_ids[j];
+            d->platform = p;
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+            // The first GPU of a platform becomes that platform's default.
+            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+                p->default_device = d;
+            }
+        }
+
+        // The first platform that has a GPU provides the global default.
+        if (default_device == NULL && p->default_device != NULL) {
+            default_device = p->default_device;
+        }
+    }
+
+    if (n_devices == 0) {
+        GGML_LOG_ERROR("ggml_opencl: could not find any OpenCL devices.\n");
+        return backend_ctx;
+    }
+
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+
+    // Try to interpret the environment variables as indices first.
+    unsigned n;
+    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+        user_platform_number = (int)n;
+    }
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+        user_device_number = (int)n;
+    }
+    if (user_platform_number != -1 && user_device_number != -1) {
+        // Both given as numbers: the device number is an index into the
+        // selected platform's device list.
+        cl_platform* platform = &platforms[user_platform_number];
+        if ((unsigned)user_device_number >= platform->n_devices) {
+            GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
+            exit(1);
+        }
+        default_device = &platform->devices[user_device_number];
+    } else {
+
+        struct cl_device * selected_devices = devices;
+        unsigned n_selected_devices = n_devices;
+
+        // Platform given, but not as a valid number: match by name or vendor.
+        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+            for (unsigned i = 0; i < n_platforms; i++) {
+                struct cl_platform * p = &platforms[i];
+                if (strstr(p->name, user_platform_string) != NULL ||
+                    strstr(p->vendor, user_platform_string) != NULL) {
+                    user_platform_number = (int)i;
+                    break;
+                }
+            }
+            if (user_platform_number == -1) {
+                GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+                exit(1);
+            }
+        }
+        if (user_platform_number != -1) {
+            struct cl_platform * p = &platforms[user_platform_number];
+            selected_devices = p->devices;
+            n_selected_devices = p->n_devices;
+            default_device = p->default_device;
+            if (n_selected_devices == 0) {
+                GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+                exit(1);
+            }
+        }
+
+        // Device given, but not as a valid number: match by name among the
+        // candidate devices.
+        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+            for (unsigned i = 0; i < n_selected_devices; i++) {
+                struct cl_device * d = &selected_devices[i];
+                if (strstr(d->name, user_device_string) != NULL) {
+                    user_device_number = d->number;
+                    break;
+                }
+            }
+            if (user_device_number == -1) {
+                GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+                exit(1);
+            }
+        }
+        if (user_device_number != -1) {
+            // Here the device number is a global index into devices[].
+            selected_devices = &devices[user_device_number];
+            n_selected_devices = 1;
+            default_device = &selected_devices[0];
+        }
+
+        GGML_ASSERT(n_selected_devices > 0);
+
+        // No GPU among the candidates: fall back to the first candidate.
+        if (default_device == NULL) {
+            default_device = &selected_devices[0];
+        }
+    }
+
+    GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+    GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
+    if (default_device->type != CL_DEVICE_TYPE_GPU) {
+        GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    }
+
+    dev_ctx->platform = default_device->platform->id;
+    dev_ctx->device = default_device->id;
+    backend_ctx->device = default_device->id;
+
+    if (strstr(default_device->name, "Adreno")) {
+        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+
+        // Default wave size is 128, A8x uses 64.
+        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
+            backend_ctx->adreno_wave_size = 64;
+        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
+                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
+            backend_ctx->adreno_wave_size = 128;
+        } else {
+            backend_ctx->adreno_wave_size = 128;
+            // backend_ctx->device_name is only populated further below, so
+            // report the name straight from the selected device.
+            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
+                "using wave size %d, "
+                "may not work as expected\n",
+                default_device->name, backend_ctx->adreno_wave_size);
+        }
+    } else if (strstr(default_device->name, "Intel")) {
+        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
+    } else {
+        GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+        return backend_ctx;
+    }
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
+        GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
+            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
+        return backend_ctx;
+    }
+#endif
+
+    // Populate backend device name
+    dev_ctx->platform_name = default_device->platform->name;
+    dev_ctx->device_name = default_device->name;
+    backend_ctx->device_name = default_device->name;
+
+    // A local ref of cl_device_id for convenience
+    cl_device_id device = backend_ctx->device;
+
+    // Check device OpenCL version, OpenCL 2.0 or above is required
+    size_t device_ver_str_size;
+    clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
+    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
+    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
+    device_ver_buffer[device_ver_str_size] = '\0';
+    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
+
+    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
+        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
+        return backend_ctx;
+    }
+
+    // Check driver version
+    size_t driver_version_str_size;
+    clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
+    char *driver_version = (char *)alloca(driver_version_str_size + 1);
+    clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
+    driver_version[driver_version_str_size] = '\0';
+    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
+    backend_ctx->driver_version = driver_version;
+
+    int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
+    bool has_vector_subgroup_broadcast =
+        adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
+    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
+        has_vector_subgroup_broadcast ? "true" : "false");
+
+    size_t ext_str_size;
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Check if ext_buffer contains cl_khr_fp16
+    backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+
+    // fp16 is required
+    if (!backend_ctx->fp16_support) {
+        GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
+        return backend_ctx;
+    }
+
+    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
+    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
+    if (strstr(device_ver_buffer, "OpenCL 3") &&
+        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
+        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
+            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
+        return backend_ctx;
+    }
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
+
+    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
+    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
+
+    // Check SVM.
+    cl_device_svm_capabilities svm_caps;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
+    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
+        svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
+        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
+        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
+        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+
+    // Print out configurations
+#ifdef GGML_OPENCL_SOA_Q
+    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
+#endif // GGML_OPENCL_SOA_Q
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    cl_context_properties properties[] = {
+        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
+    };
+
+    CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+    // A local ref of cl_context for convenience
+    cl_context context = backend_ctx->context;
+
+    //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+    //    (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
+    //    (queue = clCreateCommandQueue(context, device, 0, &err), err)
+    //)));
+    cl_command_queue_properties command_queue_props = 0;
+#ifdef GGML_OPENCL_PROFILING
+    command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
+#endif
+    CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src {
+        #include "ggml-opencl.cl.h"
+    };
+#else
+    const std::string kernel_src = read_file("ggml-opencl.cl");
+#endif
+
+    std::string compile_opts =
+        "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
+        "-cl-finite-math-only -cl-fast-relaxed-math ";
+    backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
+
+    // Non matmul kernels.
+    CL_CHECK((backend_ctx->kernel_get_rows_f32       = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_get_rows_f16       = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_get_rows_q4_0      = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_add                = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
+    CL_CHECK((backend_ctx->kernel_add_row            = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul                = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_row            = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
+    CL_CHECK((backend_ctx->kernel_scale              = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
+    CL_CHECK((backend_ctx->kernel_silu               = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
+    CL_CHECK((backend_ctx->kernel_silu_4             = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu               = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu_4             = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_relu               = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
+    CL_CHECK((backend_ctx->kernel_clamp              = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
+    CL_CHECK((backend_ctx->kernel_norm               = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
+    CL_CHECK((backend_ctx->kernel_rms_norm           = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
+    CL_CHECK((backend_ctx->kernel_diag_mask_inf      = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
+    CL_CHECK((backend_ctx->kernel_diag_mask_inf_8    = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max           = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_4         = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_norm_f32      = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_norm_f16      = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_neox_f32      = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_neox_f16      = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f16_f16        = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f16_f32        = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f32_f16        = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f32_f32        = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
+
+    // Matmul kernels.
+    CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row   = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4     = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32       = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v     = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
+
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat  = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_convert_block_q4_0     = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_restore_block_q4_0     = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
+
+    // Load additional mulmat kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_1 {
+        #include "ggml-opencl_mm.cl.h"
+    };
+#else
+    const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
+#endif
+    backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
+
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat      = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat     = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32                  = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0         = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0     = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
+
+    // Load additional data conversion kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_2 {
+        #include "ggml-opencl_cvt.cl.h"
+    };
+#else
+    const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
+#endif
+    backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
+
+    CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle     = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
+
+    // Kernels for Adreno
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_32_src {
+        #include "ggml-opencl_transpose_32.cl.h"
+    };
+#else
+    const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
+#endif
+    backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_32_16_src {
+        #include "ggml-opencl_transpose_32_16.cl.h"
+    };
+#else
+    const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
+#endif
+    backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_16_src {
+        #include "ggml-opencl_transpose_16.cl.h"
+    };
+#else
+    const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
+#endif
+    backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
+
+    // Gemv general
+    std::string CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemv_general {
+        #include "ggml-opencl_gemv_noshuffle_general.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
+#endif
+
+    backend_ctx->program_CL_gemv_general = build_program_from_source(
+        context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 2048, 16384
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=2048 "
+        " -DBLOCK_STRIDE_A=16384 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemv {
+        #include "ggml-opencl_gemv_noshuffle.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
+#endif
+
+    backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 2048, 16384
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=2048 "
+        " -DBLOCK_STRIDE_A=16384 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 5504, 44032
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=5504 "
+        " -DBLOCK_STRIDE_A=44032 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 16000, 128000
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=16000 "
+        " -DBLOCK_STRIDE_A=128000 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemm
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemm {
+        #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
+#endif
+    backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
+
+    // Allocate intermediate buffers and images
+    size_t max_A_q_d_bytes = 311164928;
+    size_t max_A_s_d_bytes = 38895616;
+    size_t max_B_d_bytes = 45088768;
+
+    CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
+    CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
+    CL_CHECK((backend_ctx->B_d_max   = clCreateBuffer(context, 0, max_B_d_bytes,   NULL, &err), err));
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    // For now we support a single device
+    ggml_backend_opencl_n_devices = 1;
+
+    return backend_ctx;
+}
+
+// Backend teardown hook. Does nothing unless GGML_OPENCL_PROFILING is defined.
+// With profiling enabled it writes every record of g_profiling_info to
+// "cl_profiling.csv" in the current directory (one row per record) and logs the
+// accumulated kernel time in milliseconds. If the file cannot be opened an
+// error is logged and nothing is written.
+static void ggml_cl2_free(void) {
+#ifdef GGML_OPENCL_PROFILING
+    FILE * fperf = fopen("cl_profiling.csv", "w");
+    if (!fperf) {
+        GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+        return;
+    }
+
+    float total_kernel_time = 0;
+    fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
+    for (const ProfilingInfo & info : g_profiling_info) {
+        // duration_ns is in nanoseconds; the CSV and the total are in milliseconds.
+        total_kernel_time += info.duration_ns/1.e6f;
+        fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+            info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
+            info.global_size[0], info.global_size[1], info.global_size[2],
+            info.local_size[0], info.local_size[1], info.local_size[2],
+            info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+    }
+    fclose(fperf);
+
+    GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Tensor extra management
+//------------------------------------------------------------------------------
+// Per-tensor bookkeeping: which cl_mem object holds the tensor data and where
+// inside that object the data starts.
+struct ggml_tensor_extra_cl {
+    // cl_mem object holding the tensor data.
+    cl_mem data_device;
+    // Offset of the data inside data_device; mainly relevant for the scratch
+    // buffer and for view operations.
+    // NB: the view offset (view_offs) is NOT part of this value and has to be
+    // taken into account separately wherever this offset is used.
+    cl_ulong offset;
+    // Real size of the cl_mem object; needed when the block is handed back to
+    // the pool.
+    size_t actual_size;
+
+    // Puts the struct back into its empty state (no buffer, zero offset/size).
+    // Does not release data_device.
+    void reset() {
+        actual_size = 0;
+        offset = 0;
+        data_device = nullptr;
+    }
+};
+
+// Additional tensor extra structs for quantized tensors.
+// These tensors are loaded from files and should not be allocated in scratch --
+// they should always be allocated from the pool. Hence, they do not have an
+// `offset`, which would indicate their location in the scratch buffer.
+// Tensor extra for q4_0 tensors: quantized values and scales are kept in
+// separate cl_mem objects. The destructor releases them via reset().
+struct ggml_tensor_extra_cl_q4_0 {
+    cl_mem q = nullptr;      // quantized values
+    cl_mem q_img = nullptr;  // quantized values as image1d_buffer_t
+    cl_mem d = nullptr;      // scales
+    cl_mem d_img = nullptr;  // scales as image1d_buffer_t
+    size_t size_q = 0;       // size of the quantized values
+    size_t size_d = 0;       // size of the scales
+
+    ~ggml_tensor_extra_cl_q4_0() {
+        reset();
+    }
+
+    // Releases the owned sub-buffers and clears every field.
+    void reset() {
+        // q and d are sub-buffers of the larger buffer allocated in
+        // ggml_backend_buffer. They have to be released here, otherwise the
+        // parent buffer cannot be released and memory leaks.
+        if (q) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+
+        // q_img and d_img are currently only set when SMALL_ALLOC is enabled
+        // and then refer to images owned by ggml_backend_opencl_buffer_context,
+        // so they are only cleared, never released, here.
+        // TODO: initialize them for the path without SMALL_ALLOC, or remove them.
+        q_img = nullptr;
+        d_img = nullptr;
+
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
+//------------------------------------------------------------------------------
+// Backend API
+//------------------------------------------------------------------------------
+
+//
+// backend
+//
+// Returns the fixed display name of this backend.
+static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+    // The name does not depend on the backend instance.
+    UNUSED(backend);
+
+    return "OpenCL";
+}
+
+// Backend `free` callback: delegates to ggml_cl2_free(); the backend handle
+// itself is not touched here.
+static void ggml_backend_opencl_free(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+
+    ggml_cl2_free();
+}
+
+// Stub for asynchronous tensor upload. It performs no work; all arguments are
+// only marked as unused.
+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // Destination.
+    GGML_UNUSED(backend);
+    GGML_UNUSED(tensor);
+
+    // Source range.
+    GGML_UNUSED(data);
+    GGML_UNUSED(offset);
+    GGML_UNUSED(size);
+}
+
+// Stub for asynchronous tensor download. It performs no work; all arguments
+// are only marked as unused.
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // Source.
+    GGML_UNUSED(backend);
+    GGML_UNUSED(tensor);
+
+    // Destination range.
+    GGML_UNUSED(data);
+    GGML_UNUSED(offset);
+    GGML_UNUSED(size);
+}
+
+// Stub for asynchronous tensor-to-tensor copy. Nothing is copied and the
+// function always reports `false`.
+static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_UNUSED(backend);
+
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+
+    return false;
+}
+
+// Stub for backend synchronization; it does nothing.
+static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
+    // No work is performed here.
+    GGML_UNUSED(backend);
+}
+
+// Executes the nodes of `cgraph` in order through ggml_cl_compute_forward().
+// Nodes whose op is RESHAPE, TRANSPOSE, VIEW, PERMUTE or NONE are skipped.
+// A node that cannot be computed is logged and then trips an assertion, so the
+// function only ever returns GGML_STATUS_SUCCESS.
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
+        ggml_tensor * node = cgraph->nodes[idx];
+
+        // These ops are not dispatched to the device; move on to the next node.
+        switch (node->op) {
+            case GGML_OP_RESHAPE:
+            case GGML_OP_TRANSPOSE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_NONE:
+                continue;
+            default:
+                break;
+        }
+
+        const bool ok = ggml_cl_compute_forward(backend, node);
+        if (!ok) {
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+// Reports whether the operation described by `op` (its op code together with
+// the types/layout of its sources) is accepted by this backend. The device
+// argument is not consulted.
+static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    GGML_UNUSED(dev);
+
+    switch (op->op) {
+        // Ops accepted without looking at types or layout.
+        case GGML_OP_NONE:
+        case GGML_OP_ADD:
+        case GGML_OP_SCALE:
+        case GGML_OP_MUL:
+        case GGML_OP_CLAMP:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_ROPE:
+            return true;
+
+        case GGML_OP_GET_ROWS: {
+            const enum ggml_type src0_type = op->src[0]->type;
+            if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) {
+                return true;
+            }
+            if (src0_type == GGML_TYPE_Q4_0) {
+#ifdef GGML_OPENCL_SOA_Q
+                // We do not support flattened Q4_0 (and possibly other Q's)
+                return false;
+#else // GGML_OPENCL_SOA_Q
+                return true;
+#endif // GGML_OPENCL_SOA_Q
+            }
+            return false;
+        }
+
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT: {
+            // Any combination of F32/F16 source and F32/F16 destination.
+            const enum ggml_type src0_type = op->src[0]->type;
+            const enum ggml_type dst_type  = op->type;
+            const bool src_is_float = src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16;
+            const bool dst_is_float = dst_type  == GGML_TYPE_F32 || dst_type  == GGML_TYPE_F16;
+            return src_is_float && dst_is_float;
+        }
+
+        case GGML_OP_UNARY: {
+            const enum ggml_unary_op unary_op = ggml_get_unary_op(op);
+            if (unary_op == GGML_UNARY_OP_GELU ||
+                unary_op == GGML_UNARY_OP_SILU ||
+                unary_op == GGML_UNARY_OP_RELU) {
+                return ggml_is_contiguous(op->src[0]);
+            }
+            return false;
+        }
+
+        case GGML_OP_MUL_MAT: {
+            const enum ggml_type src0_type = op->src[0]->type;
+            if (src0_type == GGML_TYPE_F16) {
+                return true;
+            }
+            if (src0_type == GGML_TYPE_F32 ||
+                src0_type == GGML_TYPE_Q4_0 ||
+                src0_type == GGML_TYPE_Q6_K) {
+                // These need an F32 second operand and both operands contiguous.
+                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+            }
+            return false;
+        }
+
+        case GGML_OP_DIAG_MASK_INF:
+            return op->ne[3] == 1;
+
+        default:
+            return false;
+    }
+}
+
+// Declared ahead of its definition, which follows in the buffer type section
+// of this file.
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
+
+// Returns a pointer to the function-local static GUID that identifies this
+// backend; the same pointer is returned on every call.
+static ggml_guid_t ggml_backend_opencl_guid() {
+    static ggml_guid opencl_guid = {
+        0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc,
+        0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe
+    };
+    return &opencl_guid;
+}
+
+// Backend interface table. Only name, free and graph_compute are provided;
+// the async/synchronize stubs defined above are deliberately not installed.
+static ggml_backend_i ggml_backend_opencl_i = {
+    /* .get_name                = */ ggml_backend_opencl_name,
+    /* .free                    = */ ggml_backend_opencl_free,
+    /* .set_tensor_async        = */ nullptr,  /* ggml_backend_opencl_set_tensor_async */
+    /* .get_tensor_async        = */ nullptr,  /* ggml_backend_opencl_get_tensor_async */
+    /* .cpy_tensor_async        = */ nullptr,  /* ggml_backend_opencl_cpy_tensor_async */
+    /* .synchronize             = */ nullptr,  /* ggml_backend_opencl_synchronize */
+    /* .graph_plan_create       = */ nullptr,
+    /* .graph_plan_free         = */ nullptr,
+    /* .graph_plan_update       = */ nullptr,
+    /* .graph_plan_compute      = */ nullptr,
+    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
+    /* .event_record            = */ nullptr,
+    /* .event_wait              = */ nullptr,
+};
+
+// Creates a backend instance bound to device 0 of the OpenCL backend registry.
+// The returned object is heap-allocated with `new`.
+ggml_backend_t ggml_backend_opencl_init(void) {
+    ggml_backend_dev_t device = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
+    ggml_backend_opencl_context * cl_ctx = ggml_cl2_init(device);
+
+    return new ggml_backend {
+        /* .guid      = */ ggml_backend_opencl_guid(),
+        /* .interface = */ ggml_backend_opencl_i,
+        /* .device    = */ device,
+        /* .context   = */ cl_ctx
+    };
+}
+
+// Tells whether `backend` is an OpenCL backend by comparing its get_name
+// callback with this backend's; a null handle yields false.
+bool ggml_backend_is_opencl(ggml_backend_t backend) {
+    if (!backend) {
+        return false;
+    }
+    return backend->iface.get_name == ggml_backend_opencl_name;
+}
+
+//
+// buffer
+//
+// Per-buffer state of the OpenCL backend: the cl_mem objects backing the
+// buffer plus pools of tensor extras handed out to tensors in that buffer.
+struct ggml_backend_opencl_buffer_context {
+    // A buffer context may own several cl_mem objects. This exists for
+    // flattening quantized weights and is meant to be used together with
+    // GGML_OPENCL_SMALL_ALLOC, where each tensor gets its own buffer. With
+    // flattening and small allocation enabled, each tensor is backed by two
+    // cl_mem objects (quants and scales) packed into a backend_opencl_buffer.
+    ggml_backend_opencl_buffer_context(cl_mem buf)
+        : buffer{buf}, name("OpenCL") {
+    }
+
+    // Releases every owned cl_mem object and destroys all pooled extras.
+    ~ggml_backend_opencl_buffer_context() {
+        for (cl_mem buf : buffer) {
+            CL_CHECK(clReleaseMemObject(buf));
+        }
+        for (cl_mem im : img) {
+            CL_CHECK(clReleaseMemObject(im));
+        }
+
+        // Deleting the extras runs their destructors.
+        for (ggml_tensor_extra_cl * extra : temp_tensor_extras) {
+            delete extra;
+        }
+        for (ggml_tensor_extra_cl * extra : temp_tensor_extras_in_use) {
+            delete extra;
+        }
+        for (ggml_tensor_extra_cl_q4_0 * extra : temp_tensor_extras_q4_0) {
+            delete extra;
+        }
+        for (ggml_tensor_extra_cl_q4_0 * extra : temp_tensor_extras_q4_0_in_use) {
+            delete extra;
+        }
+    }
+
+    // Hands out a cleared ggml_tensor_extra_cl, recycling one from the pool of
+    // available extras when possible, and records it as in use.
+    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
+        ggml_tensor_extra_cl * extra = nullptr;
+        if (!temp_tensor_extras.empty()) {
+            extra = temp_tensor_extras.back();
+            temp_tensor_extras.pop_back();
+        } else {
+            extra = new ggml_tensor_extra_cl();
+        }
+
+        extra->reset();
+        temp_tensor_extras_in_use.push_back(extra);
+        return extra;
+    }
+
+    // Same as above, for ggml_tensor_extra_cl_q4_0.
+    ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
+        ggml_tensor_extra_cl_q4_0 * extra = nullptr;
+        if (!temp_tensor_extras_q4_0.empty()) {
+            extra = temp_tensor_extras_q4_0.back();
+            temp_tensor_extras_q4_0.pop_back();
+        } else {
+            extra = new ggml_tensor_extra_cl_q4_0();
+        }
+
+        extra->reset();
+        temp_tensor_extras_q4_0_in_use.push_back(extra);
+        return extra;
+    }
+
+    // Moves every in-use extra back to the corresponding pool of available
+    // extras. The extras themselves are not cleared here; that happens when
+    // they are handed out again.
+    void reset() {
+        temp_tensor_extras.insert(temp_tensor_extras.end(),
+            temp_tensor_extras_in_use.begin(), temp_tensor_extras_in_use.end());
+        temp_tensor_extras_in_use.clear();
+
+        temp_tensor_extras_q4_0.insert(temp_tensor_extras_q4_0.end(),
+            temp_tensor_extras_q4_0_in_use.begin(), temp_tensor_extras_q4_0_in_use.end());
+        temp_tensor_extras_q4_0_in_use.clear();
+    }
+
+    // Pools of extras. `temp_tensor_extras` holds the available ones and
+    // `temp_tensor_extras_in_use` the ones currently handed out. On the first
+    // run new extras are created and placed in `in_use`. When the buffer is
+    // reset through the `reset` callback, everything in `in_use` is moved to
+    // the available pool for reuse.
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+
+    // The buffer_context is first created by ggml_backend_buft_alloc_buffer
+    // before any tensor is initialized (at the beginning of
+    // alloc_tensor_range), so this vector always holds at least one buffer
+    // object. While tensors are being initialized, that original buffer object
+    // is released if both flattening and small allocation are enabled, and
+    // further buffer objects are created in init_tensor to represent the
+    // flattened quantized weights.
+    std::vector<cl_mem> buffer;
+    // image1d_buffer_t objects wrapping the quants and scales. For Q4_0 there
+    // should be two of them - one for quants and one for scales. They should
+    // only be populated when flattening and small allocation are enabled.
+    std::vector<cl_mem> img;
+    std::string name;
+};
+
+// Fake, non-null base address reported for OpenCL buffers. Tensor data
+// pointers are expressed relative to it and converted back into byte offsets
+// by subtracting it.
+static void * const cl_ptr_base = reinterpret_cast<void *>(static_cast<uintptr_t>(0x1000));
+
+// Buffer `free_buffer` callback: destroys the buffer context, whose destructor
+// releases the cl_mem objects and pooled extras.
+static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    delete static_cast<ggml_backend_opencl_buffer_context *>(buffer->context);
+}
+
+// Buffer `get_base` callback: every buffer reports the same fake base address.
+static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+
+    return cl_ptr_base;
+}
+
+// Buffer `init_tensor` callback: attaches a ggml_tensor_extra_cl to `tensor`.
+// View tensors share the extra of their source tensor; all other tensors get
+// an extra from the buffer context's pool describing their location in the
+// buffer's first cl_mem object.
+static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+
+    ggml_cl2_init(buffer->buft->device);
+
+    if (tensor->view_src == nullptr) {
+        // Ordinary tensor: its byte offset is the distance of the data pointer
+        // from the fake base address.
+        ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
+        extra->offset      = (char *)tensor->data - (char *)cl_ptr_base;
+        extra->data_device = ctx->buffer[0];
+        extra->actual_size = ggml_nbytes(tensor);
+
+        tensor->extra = extra;
+        return;
+    }
+
+    GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+
+    ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
+    GGML_ASSERT(view_extra && "view_extra is nullptr?");
+
+    // Reuse the extra of the parent tensor. The offset of this view tensor
+    // is then `extra->offset + view_offs` and has to be computed wherever it
+    // is used. This change is needed because of the change to ggml_alloc.c in
+    // https://github.com/ggerganov/llama.cpp/pull/7640.
+    // `buffer` passed in here will always be `tensor->buffer`. Allocating
+    // extras from the same buffer context is fine for ordinary intermediate
+    // tensors, but for views into kv cache tensors it would mess up the
+    // extras used by the kv cache.
+    // Before #7640, `buffer` was for intermediate tensors, which is always
+    // different from that of kv cache tensors.
+    //
+    // NB: extra->offset no longer accounts for view_offs.
+    // NB: this should not apply to weight tensors (for end-to-end runs, but
+    //     may apply for test-backend-ops).
+    // FIXME: if any unexpected results are seen, double check the offset -
+    // there could be other places that need a fix.
+    tensor->extra = view_extra;
+}
+
+// Decides whether the optimized gemm/gemv kernels apply: they are used for
+// large matrices without batch. `tensor` is the quantized weights matrix.
+inline bool use_adreno_kernels(const ggml_tensor *tensor) {
+    const bool is_large    = tensor->ne[0] >= 512 && tensor->ne[1] >= 512;
+    const bool is_unbatched = tensor->ne[2] == 1 && tensor->ne[3] == 1;
+    return is_large && is_unbatched;
+}
+
+// Buffer `set_tensor` callback: uploads host memory `data` into `tensor`.
+//
+// Generic path: a blocking write of `size` bytes to the tensor's location in
+// its backing buffer, `offset` bytes into the tensor. The location is
+// `extra->offset + tensor->view_offs`, because extra->offset does not include
+// the view offset.
+//
+// Q4_0 path (only with GGML_OPENCL_SOA_Q): the block_q4_0 data is uploaded to
+// a temporary buffer and split by a conversion kernel into separate scale and
+// quant sub-buffers that alias the tensor's original storage. The tensor's
+// extra is replaced by a ggml_tensor_extra_cl_q4_0. With
+// GGML_OPENCL_USE_ADRENO_KERNELS, large unbatched matrices are additionally
+// transposed in place.
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+#ifdef GGML_OPENCL_SOA_Q
+    // We separate the quantized bits and scale from block_q4_0 by using an
+    // additional kernel, where each thread handles a block. We first read the
+    // original weights into a temporary buffer, then create two separate
+    // buffers for quantized bits and scales, which are then populated by the
+    // conversion kernel.
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        // Tensors should have been preallocated, therefore they should
+        // already have ggml_tensor_extra_cl as extra.
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        // NB: the whole tensor (ggml_nbytes) is uploaded on this path; `size`
+        // is not consulted.
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // We consider the specified offset arg as always, although for weights
+        // the offset arg should be 0 (we do not assert this).
+
+        // We create subbuffers from the original tensor buffer for scales and
+        // quants - i.e., scales and quants are aliases into the buffer object
+        // that backs the original tensor. This is a cleaner way to adapt to the
+        // new memory management.
+        // In the old code, we allocate new buffers for scales and quants
+        // respectively, which could still be done but would result in double
+        // allocation; properly deallocating the preallocated buffer that backs
+        // the tensors is tricky and would leak the backend specific information
+        // into the general backend code.
+        // Does this create misaligned subbuffers (alignment is 1024) in certain
+        // cases ?
+        cl_buffer_region region;
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        // Create subbuffer for scales.
+        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        // Create subbuffer for quants.
+        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+
+        // The optimized kernels need weights in natural order, so unshuffle.
+        if (use_adreno_kernels(tensor)) {
+            kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
+        }
+    #else
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        // One work-item per quantization block.
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // Each enqueue returns a new event object that must be released.
+        CL_CHECK(clReleaseEvent(evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
+        // transpose the weights and scales
+    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        // Only do transpose for large, non batched matrix
+        // TODO: use preallocated images instead of sub-buffer then image
+        if (use_adreno_kernels(tensor)) {
+        // <----------------------------------------------------------------------------------> //
+        // start transpose
+        // <----------------------------------------------------------------------------------> //
+        int M = tensor->ne[1];   // ne01
+        int K = tensor->ne[0];   // ne00
+
+        // transpose is out of place, so we need to allocate transposed buffers
+        // <----------------------------------------------------------------------------------> //
+        // use sub_buffer of max buffer size instead
+
+        size_t q_size_bytes = K * M / 8 * sizeof(float);
+        cl_buffer_region region;
+        region.origin = 0;
+        region.size = q_size_bytes;
+        cl_mem qT_d = clCreateSubBuffer(
+            backend_ctx->A_q_d_max,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &err);
+        CL_CHECK(err);
+
+        size_t d_size_bytes = M * (K / 32) * 2;
+        region.origin = 0;
+        region.size = d_size_bytes;
+        cl_mem dT_d = clCreateSubBuffer(
+            backend_ctx->A_s_d_max,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &err);
+        CL_CHECK(err);
+
+        // <----------------------------------------------------------------------------------> //
+
+
+        // create images from the buffers
+        // <----------------------------------------------------------------------------------> //
+        cl_mem q_d_image1D;
+        cl_mem d_d_image1D;
+        cl_mem qT_d_image1D;
+        cl_mem dT_d_image1D;
+
+        cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        cl_image_desc img_desc_1d;
+
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 8 / 4;
+        img_desc_1d.buffer = extra->q;
+        q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 8 / 4;
+        img_desc_1d.buffer = qT_d;
+        qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 32 / 4 / 2;
+        img_desc_1d.buffer = extra->d;
+        d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 32 / 4 / 2;
+        img_desc_1d.buffer = dT_d;
+        dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+        // <----------------------------------------------------------------------------------> //
+
+        // set up and call the transpose kernels
+        // <----------------------------------------------------------------------------------> //
+        // weights
+        int height_q = M / 8;
+        int width_q = K / 8 / 4;
+        kernel = backend_ctx->kernel_transpose_16;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_q));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_q));
+
+        size_t local_size_q[3] = {4, 16, 1};
+        size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseEvent(evt));
+
+        // scales
+        int height_s = M / 8;
+        int width_s = K / 32 / 8;
+
+        kernel = backend_ctx->kernel_transpose_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+        size_t local_size_s[3] = {4, 16, 1};
+        size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseEvent(evt));
+        // <----------------------------------------------------------------------------------> //
+
+        // copy transposed buffer contents to original buffers
+        // <----------------------------------------------------------------------------------> //
+        // weights
+        CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseEvent(evt));
+
+        // scales
+        CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseEvent(evt));
+        // <----------------------------------------------------------------------------------> //
+
+        // deallocate transpose buffers
+        // <----------------------------------------------------------------------------------> //
+        CL_CHECK(clReleaseMemObject(qT_d));
+        CL_CHECK(clReleaseMemObject(dT_d));
+
+        // deallocate temporary images
+        CL_CHECK(clReleaseMemObject(q_d_image1D));
+        CL_CHECK(clReleaseMemObject(d_d_image1D));
+        CL_CHECK(clReleaseMemObject(qT_d_image1D));
+        CL_CHECK(clReleaseMemObject(dT_d_image1D));
+        // <----------------------------------------------------------------------------------> //
+        // end transpose
+        // <----------------------------------------------------------------------------------> //
+        }
+    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+        return;
+    }
+#endif // GGML_OPENCL_SOA_Q
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+    GGML_ASSERT(extra);
+
+    // extra->offset does not include the view offset, so it has to be added
+    // here; otherwise a write to a view tensor (which shares its parent's
+    // extra) would land at the start of the parent. This matches get_tensor.
+    CL_CHECK(clEnqueueWriteBuffer(
+        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
+        size, data, 0, NULL, NULL));
+
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(context);
+}
+
+// Buffer `get_tensor` callback: downloads `size` bytes of `tensor`, starting
+// `offset` bytes into the tensor, to host memory `data`.
+//
+// The queue is drained first so that all previously submitted commands have
+// finished. With GGML_OPENCL_SOA_Q, Q4_0 tensors are first restored from the
+// flattened quant/scale buffers into a temporary block_q4_0 buffer, from which
+// the requested range is read.
+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->extra);
+
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    // Make sure all previously submitted commands are finished.
+    CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+    // In end-to-end runs, get_tensor is usually used to get back the logits,
+    // where we can simply do clEnqueueReadBuffer since they are f32.
+    // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
+    // which requires reading back quantized weight tensors.
+    // To properly support this, we need to restore block_q4_0 struct arrays
+    // from the flattened buffers.
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
+
+        // Temporary buffer receiving the restored block_q4_0 array.
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        // One work-item per quantization block.
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // The event returned by the enqueue call must be released, otherwise
+        // it leaks on every read-back.
+        CL_CHECK(clReleaseEvent(evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+#endif // GGML_OPENCL_SOA_Q
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+
+    // extra->offset does not include the view offset, hence view_offs is added.
+    CL_CHECK(clEnqueueReadBuffer(
+        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
+        size, data, 0, NULL, NULL));
+
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(context);
+}
+
+// Buffer `clear` callback: fills every cl_mem object of the buffer context
+// with the byte `value` over `buffer->size` bytes, then waits for the queue to
+// finish.
+static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+    cl_command_queue queue = backend_ctx->queue;
+
+    for (cl_mem buf : ctx->buffer) {
+        CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+    }
+
+    // The fills above are only enqueued; block until they have completed.
+    CL_CHECK(clFinish(queue));
+}
+
+// Buffer `reset` callback: returns all in-use tensor extras of the buffer
+// context to their pools.
+static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+    static_cast<ggml_backend_opencl_buffer_context *>(buffer->context)->reset();
+}
+
+// Buffer interface table; memset_tensor and cpy_tensor are not provided.
+static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
+    /* .memset_tensor   = */ nullptr,
+    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
+    /* .cpy_tensor      = */ nullptr,
+    /* .clear           = */ ggml_backend_opencl_buffer_clear,
+    /* .reset           = */ ggml_backend_opencl_buffer_reset,
+};
+
+//
+// buffer type
+//
+
+// Returns the fixed display name of the OpenCL buffer type.
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
+    GGML_UNUSED(buffer_type);
+
+    return "OpenCL";
+}
+
+// Buffer type `alloc_buffer` callback: creates a read/write device buffer of
+// `size` bytes (at least one byte) and wraps it in a ggml backend buffer.
+// Returns nullptr, after logging, when the device allocation fails.
+static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
+
+    // clCreateBuffer returns -61 for size 0, so never ask for less than a byte.
+    if (size == 0) {
+        size = 1;
+    }
+
+    cl_int status;
+    cl_mem device_mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &status);
+    if (status != CL_SUCCESS) {
+        GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+        return nullptr;
+    }
+
+    // The buffer context takes ownership of the cl_mem object.
+    ggml_backend_opencl_buffer_context * buffer_ctx = new ggml_backend_opencl_buffer_context(device_mem);
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, buffer_ctx, size);
+}
+
+// Returns the buffer alignment taken from the backend context. The value is
+// looked up once and cached in a function-local static.
+static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+    // FIXME: not thread safe, device may not be initialized yet
+    const cl_uint not_queried = (cl_uint)-1;
+    static cl_uint alignment = not_queried;
+
+    if (alignment != not_queried) {
+        return alignment;
+    }
+
+    alignment = ggml_cl2_init(buffer_type->device)->alignment;
+    return alignment;
+}
+
+// Returns the maximum allocation size taken from the backend context. The
+// value is looked up once and cached in a function-local static.
+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+    const size_t not_queried = (size_t)-1;
+    static size_t max_size = not_queried;
+
+    if (max_size != not_queried) {
+        return max_size;
+    }
+
+    max_size = ggml_cl2_init(buffer_type->device)->max_alloc_size;
+    return max_size;
+}
+
+// A buffer of this type is usable by `backend` exactly when `backend` is an
+// OpenCL backend; the buffer type itself is not inspected.
+static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    UNUSED(buft);
+
+    return ggml_backend_is_opencl(backend);
+}
+
+// Buffer type interface table; get_alloc_size and is_host are not provided.
+static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_opencl_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ nullptr,
+    /* .is_host          = */ nullptr,
+};
+
+// Returns a pointer to the single, statically allocated OpenCL buffer type object.
+// It uses the OpenCL buffer type interface table and is bound to g_ggml_backend_opencl_device.
+ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+    static ggml_backend_buffer_type opencl_buft = {
+        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
+        /* .device  = */ &g_ggml_backend_opencl_device,
+        /* .context = */ nullptr,
+    };
+
+    ggml_backend_buffer_type_t result = &opencl_buft;
+    return result;
+}
+
+//
+// backend device
+//
+
+// Returns the fixed device name string; the device handle is not consulted.
+static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    return "GPUOpenCL";
+}
+
+// Returns the device_name string held by the device's OpenCL device context.
+// The returned pointer refers to storage owned by that context.
+static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_opencl_device_context * opencl_dev = (ggml_backend_opencl_device_context *) dev->context;
+    const char * description = opencl_dev->device_name.c_str();
+    return description;
+}
+
+// Writes the constant value 1 to both *free and *total; the device is not queried.
+static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    GGML_UNUSED(dev);
+
+    const size_t placeholder = 1;
+    *free  = placeholder;
+    *total = placeholder;
+}
+
+// Always classifies the OpenCL device as a GPU device.
+static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+// Fills `props` with the device's name, description, type and memory figures by calling the
+// corresponding device getters, and sets every capability flag to false.
+static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    const ggml_backend_dev_caps no_caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+
+    props->name        = ggml_backend_opencl_device_get_name(dev);
+    props->description = ggml_backend_opencl_device_get_description(dev);
+    props->type        = ggml_backend_opencl_device_get_type(dev);
+    ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps        = no_caps;
+}
+
+// Creates a backend object for `dev`: obtains the OpenCL backend context via ggml_cl2_init and
+// wraps it in a heap-allocated ggml_backend. `params` is ignored.
+static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+
+    ggml_backend_opencl_context * opencl_ctx = ggml_cl2_init(dev);
+
+    return new ggml_backend {
+        /* .guid      = */ ggml_backend_opencl_guid(),
+        /* .interface = */ ggml_backend_opencl_i,
+        /* .device    = */ dev,
+        /* .context   = */ opencl_ctx,
+    };
+}
+
+// Returns the OpenCL buffer type; the same object is returned regardless of `dev`.
+static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    ggml_backend_buffer_type_t buft = ggml_backend_opencl_buffer_type();
+    return buft;
+}
+
+// Wrapping a host pointer in a buffer is not implemented: every argument is ignored and
+// nullptr is returned unconditionally.
+static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(max_tensor_size);
+    GGML_UNUSED(size);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(dev);
+
+    return nullptr;
+}
+
+// Device-interface adapter: forwards the query to ggml_opencl_supports_op unchanged.
+static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const bool supported = ggml_opencl_supports_op(dev, op);
+    return supported;
+}
+
+// A buffer type is accepted when its get_name callback is the OpenCL buffer type's get_name,
+// i.e. the buffer type is identified by comparing function pointers. `dev` is not consulted.
+static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(dev);
+
+    const bool is_opencl_buft = (buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name);
+    return is_opencl_buft;
+}
+
+// Device interface table for the OpenCL backend.
+// Slots left NULL (host buffer type, op offloading, events) have no OpenCL-specific callback here.
+static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+    /* .get_name             = */ ggml_backend_opencl_device_get_name,
+    /* .get_description      = */ ggml_backend_opencl_device_get_description,
+    /* .get_memory           = */ ggml_backend_opencl_device_get_memory,
+    /* .get_type             = */ ggml_backend_opencl_device_get_type,
+    /* .get_props            = */ ggml_backend_opencl_device_get_props,
+    /* .init_backend         = */ ggml_backend_opencl_device_init,
+    /* .get_buffer_type      = */ ggml_backend_opencl_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_opencl_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_opencl_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// Backend registry
+
+// Returns the fixed registry name string; `reg` is not consulted.
+static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    return "OpenCL";
+}
+
+// Returns the current value of ggml_backend_opencl_n_devices; `reg` is not consulted.
+static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    const size_t n_devices = ggml_backend_opencl_n_devices;
+    return n_devices;
+}
+
+// Returns the single global OpenCL device. Only index 0 is valid; any other index trips the assert.
+static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+
+    GGML_ASSERT(index == 0);
+
+    ggml_backend_dev_t device = &g_ggml_backend_opencl_device;
+    return device;
+}
+
+// Registry interface table for the OpenCL backend.
+// The get_proc_address slot is left NULL: no callback is provided for it here.
+static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
+    /* .get_name         = */ ggml_backend_opencl_reg_get_name,
+    /* .device_count     = */ ggml_backend_opencl_reg_device_count,
+    /* .device_get       = */ ggml_backend_opencl_reg_device_get,
+    /* .get_proc_address = */ NULL,
+};
+
+// Returns the OpenCL backend registry object.
+// The first call fills in the registry, binds the global device object to it and runs
+// ggml_cl2_init on that device; later calls return the already-initialized registry.
+ggml_backend_reg_t ggml_backend_opencl_reg(void) {
+    // TODO: make this thread-safe somehow?
+    static ggml_backend_reg opencl_reg;
+    static bool setup_done = false;
+
+    if (setup_done) {
+        return &opencl_reg;
+    }
+
+    opencl_reg = ggml_backend_reg {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface   = */ ggml_backend_opencl_reg_i,
+        /* .context = */ NULL,
+    };
+
+    // The device must point back at the registry before it is handed to ggml_cl2_init.
+    g_ggml_backend_opencl_device = ggml_backend_device {
+        /* .iface   = */ ggml_backend_opencl_device_i,
+        /* .reg     = */ &opencl_reg,
+        /* .context = */ &g_ggml_ctx_dev_main,
+    };
+
+    ggml_cl2_init(&g_ggml_backend_opencl_device);
+
+    setup_done = true;
+    return &opencl_reg;
+}
+
+// NOTE(review): presumably exposes ggml_backend_opencl_reg as the entry point used when the
+// backend is built as a dynamically loaded module - confirm against the macro's definition.
+GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
+
+//------------------------------------------------------------------------------
+// Debugging utils
+//------------------------------------------------------------------------------
+#if 0
+#define QK4_0 32
+// Local copy of the q4_0 block layout used only by the dump helper below:
+// one fp16 value followed by QK4_0/2 bytes holding the packed quants.
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
+    "wrong q4_0 block size/padding");
+
+#include <math.h>
+#ifdef __cplusplus
+#include "half.hpp"
+#endif
+
+// Debug helper: reads `tensor` back from device memory and writes it as text to
+// ./tensor-dumps/<tensor name>.txt, one element (or one q4_0 block) per line.
+//
+// Supported types are F32, I32, F16 (C++ builds only) and Q4_0; other types produce an empty file.
+// For F32/F16 the dump stops at the first NaN and reports it on stdout.
+// All host buffers are released on every exit path, including when the file cannot be opened.
+static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
+    const size_t  nbytes  = ggml_nbytes(tensor);
+    const int64_t n_elems = ggml_nelements(tensor);
+
+    void * buf = malloc(nbytes);
+    // malloc(0) may legitimately return NULL, so only a failed non-empty allocation is fatal.
+    GGML_ASSERT(buf || nbytes == 0);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+#ifdef GGML_OPENCL_SOA_Q
+    // Stay NULL unless the tensor is Q4_0, which keeps the unconditional free() calls below safe.
+    void * buf_q = NULL;
+    void * buf_d = NULL;
+#endif
+
+#ifdef GGML_USE_OPENCL
+    // Make sure everything is done.
+    CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        // Quants and scales live in two separate device buffers; read each one on its own.
+        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
+        GGML_ASSERT(extra);
+
+        size_t size_q = n_elems/QK4_0 * QK4_0/2;
+        size_t size_d = n_elems/QK4_0 * sizeof(ggml_fp16_t);
+        GGML_ASSERT(size_q + size_d == nbytes);
+        buf_q = malloc(size_q);
+        buf_d = malloc(size_d);
+        GGML_ASSERT((buf_q || size_q == 0) && (buf_d || size_d == 0));
+
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
+        CL_CHECK(clFinish(queue));
+    } else {
+        // Read out the tensor from GPU memory.
+        ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+        GGML_ASSERT(extra);
+
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+            extra->offset, nbytes, buf, 0, NULL, NULL));
+        CL_CHECK(clFinish(queue));
+    }
+#else
+    // Read out the tensor from GPU memory.
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+    GGML_ASSERT(extra);
+
+    CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+        extra->offset, nbytes, buf, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+#endif // GGML_OPENCL_SOA_Q
+#endif // GGML_USE_OPENCL
+
+    // Open file and dump. snprintf bounds the write; an over-long tensor name is truncated.
+    char fname[512];
+    snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
+    FILE * f = fopen(fname, "w");
+    if (!f) {
+        printf("Failed to open %s\n", fname);
+#ifdef GGML_OPENCL_SOA_Q
+        free(buf_d);
+        free(buf_q);
+#endif
+        free(buf);
+        return;
+    }
+
+    if (tensor->type == GGML_TYPE_F32) {
+        float * data = (float *) buf;
+        for (int64_t i = 0; i < n_elems; ++i) {
+            if (isnan(data[i])) {
+                printf("NaN found: %s\n", tensor->name);
+                break;
+            }
+            fprintf(f, "%f\n", data[i]);
+        }
+    } else if (tensor->type == GGML_TYPE_I32) {
+        // Integers cannot be NaN, so no NaN scan is needed here.
+        int * data = (int *) buf;
+        for (int64_t i = 0; i < n_elems; ++i) {
+            fprintf(f, "%d\n", data[i]);
+        }
+    } else if (tensor->type == GGML_TYPE_F16) {
+#ifdef __cplusplus
+        half_float::half * data = (half_float::half *) buf;
+        for (int64_t i = 0; i < n_elems; ++i) {
+            if (std::isnan(data[i])) {
+                printf("NaN found: %s\n", tensor->name);
+                break;
+            }
+            fprintf(f, "%f\n", float(data[i]));
+        }
+#endif
+    } else if (tensor->type == GGML_TYPE_Q4_0) {
+#ifdef GGML_OPENCL_SOA_Q
+        ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
+        unsigned char * data_q = (unsigned char *)buf_q;
+
+        for (int64_t i = 0; i < n_elems/QK4_0; ++i) {
+            fprintf(f, "%04x, ", (unsigned) data_d[i]);
+            for (int k = 0; k < QK4_0/2; ++k) {
+                fprintf(f, "%02x, ", (unsigned) data_q[k]);
+            }
+            fprintf(f, "\n");
+            data_q += QK4_0/2;
+        }
+#else
+        block_q4_0 * data = (block_q4_0 *) buf;
+        for (int64_t i = 0; i < n_elems/QK4_0; ++i) {
+            fprintf(f, "%04x, ", (unsigned) data[i].d);
+            for (int k = 0; k < QK4_0/2; ++k) {
+                fprintf(f, "%02x, ", (unsigned) data[i].qs[k]);
+            }
+            fprintf(f, "\n");
+        }
+#endif // GGML_OPENCL_SOA_Q
+    }
+
+#ifdef GGML_OPENCL_SOA_Q
+    free(buf_d);
+    free(buf_q);
+#endif
+    free(buf);
+    // fclose flushes pending output.
+    fclose(f);
+}
+#else
+// Variadic so that both dump_tensor(tensor) and dump_tensor(backend, tensor) compile to nothing.
+#define dump_tensor(...)
+#endif
+
+//------------------------------------------------------------------------------
+// Profiling utility
+//------------------------------------------------------------------------------
+#ifdef GGML_OPENCL_PROFILING
+// Blocks until `evt` has completed, then fills `info` with:
+//   - the elapsed time between the event's COMMAND_START and COMMAND_END profiling counters,
+//   - the tensor name (as op_name) and the kernel's function name,
+//   - the launch's global/local work sizes and the tensor's four ne[] dimensions.
+// NOTE(review): `evt` is not released here - confirm that callers own and release it.
+void populateProfilingInfo(
+        ProfilingInfo& info, cl_event evt, cl_kernel kernel,
+        size_t global_size[3], size_t local_size[3],
+        const ggml_tensor * tensor) {
+    CL_CHECK(clWaitForEvents(1, &evt));
+
+    cl_ulong t_start;
+    cl_ulong t_end;
+    CL_CHECK(clGetEventProfilingInfo(
+        evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_start, NULL));
+    CL_CHECK(clGetEventProfilingInfo(
+        evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, NULL));
+
+    char name_buf[512];
+    CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+        sizeof(name_buf), name_buf, NULL));
+
+    info.duration_ns = t_end - t_start;
+    info.op_name     = tensor->name;
+    info.kernel_name = name_buf;
+
+    for (int dim = 0; dim < 3; ++dim) {
+        info.local_size[dim]  = local_size[dim];
+        info.global_size[dim] = global_size[dim];
+    }
+    for (int dim = 0; dim < 4; ++dim) {
+        info.output_size[dim] = tensor->ne[dim];
+    }
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Ops
+//------------------------------------------------------------------------------
+
+// Decides whether a matrix multiplication qualifies for the OpenCL path:
+//   - src0 must be F32, F16 or a quantized type,
+//   - src1 and dst must both be F32,
+//   - dst->ne[0], dst->ne[1] and src1->ne[0] must each be at least 32.
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    const bool src0_type_ok =
+        src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type);
+    if (!src0_type_ok) {
+        return false;
+    }
+
+    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    // TODO: find the optimal values for these
+    const int64_t min_dim = 32;
+    return dst->ne[0] >= min_dim && dst->ne[1] >= min_dim && src1->ne[0] >= min_dim;
+}
+
+// No-op handler: takes the same arguments as the other op functions and does nothing with them.
+static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    UNUSED(dst);
+    UNUSED(src1);
+    UNUSED(src0);
+    UNUSED(backend);
+}
+
+// Enqueues the get_rows kernel matching src0's type (F32, F16 or Q4_0) on the backend queue.
+// src0, src1 and dst must all be non-null and carry an `extra` (device buffer info); any other
+// src0 type trips an assert. One work-item is launched per (src1->ne[0], src1->ne[1]) pair.
+static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    // Sizes and strides handed to the kernel (the asserts above guarantee non-null tensors).
+    const int      ne00 = src0->ne[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const int      ne10 = src1->ne[0];
+    const cl_ulong nb10 = src1->nb[0];
+    const int      ne11 = src1->ne[1];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb1  = dst->nb[1];
+    const cl_ulong nb2  = dst->nb[2];
+
+    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue cmd_queue = ctx->queue;
+
+    ggml_tensor_extra_cl * src0_extra = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * src1_extra = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * dst_extra  = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Byte offsets into the device buffers, including each tensor's view offset.
+    cl_ulong src0_off = src0_extra->offset + src0->view_offs;
+    cl_ulong src1_off = src1_extra->offset + src1->view_offs;
+    cl_ulong dst_off  = dst_extra->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:  kernel = ctx->kernel_get_rows_f32;  break;
+        case GGML_TYPE_F16:  kernel = ctx->kernel_get_rows_f16;  break;
+        case GGML_TYPE_Q4_0: kernel = ctx->kernel_get_rows_q4_0; break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &src0_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &src0_off));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &src1_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &src1_off));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &dst_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &dst_off));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
+
+    size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
+    size_t local_work_size[] = {1, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    // Profiling builds capture the launch event and record its timing.
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Shared implementation of the element-wise binary ops below (add and mul), which differ only
+// in the pair of kernels they launch.
+//
+//   kernel_row     - used when src1 is a single contiguous row (nelements(src1) == ne10) and both
+//                    ne00 and ne10 are multiples of 4. It receives ne00/4 as its size argument and
+//                    is launched with nelements(dst)/4 work-items in groups of 64.
+//   kernel_generic - used otherwise. It receives the full shape/stride description of src0, src1
+//                    and dst and is launched with min(64, ne0) work-items per src0 row.
+//
+// src0, src1 and dst must be non-null and carry an `extra` (device buffer info).
+static void ggml_cl_binary_elementwise(
+        ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        cl_kernel kernel_row, cl_kernel kernel_generic) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    // Shapes (narrowed to int) and byte strides in the form the kernels expect.
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Byte offsets into the device buffers, including each tensor's view offset.
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const bool bcast_row =
+        ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
+
+    cl_kernel kernel = bcast_row ? kernel_row : kernel_generic;
+
+    // The first six arguments (buffers and offsets) are common to both kernels.
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[3];
+    size_t local_work_size[3];
+
+    if (bcast_row) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        const int ne = ne00 / 4;
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+
+        // Computed in 64-bit so that large tensors are not truncated through int.
+        global_work_size[0] = (size_t)(ggml_nelements(dst)/4);
+        global_work_size[1] = 1;
+        global_work_size[2] = 1;
+
+        local_work_size[0] = 64;
+        local_work_size[1] = 1;
+        local_work_size[2] = 1;
+    } else {
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
+
+        const unsigned int nth = MIN(64, ne0);
+
+        // Widen before multiplying so the product cannot wrap in 32-bit arithmetic.
+        global_work_size[0] = (size_t)ne01 * nth;
+        global_work_size[1] = (size_t)ne02;
+        global_work_size[2] = (size_t)ne03;
+
+        local_work_size[0] = nth;
+        local_work_size[1] = 1;
+        local_work_size[2] = 1;
+    }
+
+#ifdef GGML_OPENCL_PROFILING
+    // Profiling builds capture the launch event and record its timing.
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Enqueues dst = src0 + src1 using the backend's add kernels (row-broadcast or generic).
+static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_cl_binary_elementwise(backend, src0, src1, dst,
+        backend_ctx->kernel_add_row, backend_ctx->kernel_add);
+}
+
+// Enqueues dst = src0 * src1 using the backend's mul kernels (row-broadcast or generic).
+static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_cl_binary_elementwise(backend, src0, src1, dst,
+        backend_ctx->kernel_mul_row, backend_ctx->kernel_mul);
+}
+
+// Enqueues the GELU kernel that reads src0 and writes dst. src1 is unused.
+// When the element count is a multiple of 4, the kernel_gelu_4 variant is launched with a
+// quarter of the work-items; otherwise kernel_gelu gets one work-item per element.
+// src0 and dst must be non-null and carry an `extra` (device buffer info).
+static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Byte offsets into the device buffers, including each tensor's view offset.
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_gelu_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_gelu;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    // The enqueue result is checked so a failed launch is reported instead of silently dropped.
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Enqueues the SiLU kernel that reads src0 and writes dst. src1 is unused.
+// When the element count is a multiple of 4, the kernel_silu_4 variant is launched with a
+// quarter of the work-items; otherwise kernel_silu gets one work-item per element.
+// src0 and dst must be non-null and carry an `extra` (device buffer info).
+static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    UNUSED(src1);
+
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue cmd_queue = ctx->queue;
+
+    ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Byte offsets into the device buffers, including each tensor's view offset.
+    cl_ulong src_off = src_extra->offset + src0->view_offs;
+    cl_ulong dst_off = dst_extra->offset + dst->view_offs;
+
+    int n_items = ggml_nelements(dst);
+    const bool use_vec4 = (n_items % 4 == 0);
+
+    cl_kernel kernel = use_vec4 ? ctx->kernel_silu_4 : ctx->kernel_silu;
+    if (use_vec4) {
+        n_items /= 4;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &src_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &src_off));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &dst_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &dst_off));
+
+    size_t global_work_size[] = {(size_t)n_items, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    // Profiling builds capture the launch event and record its timing.
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Launches kernel_relu with one work-item per element of dst, passing the src0
+// and dst device buffers together with their offsets. src1 is ignored.
+static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    UNUSED(src1);
+
+    auto * ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    cl_command_queue queue = ctx->queue;
+    cl_kernel kernel = ctx->kernel_relu;
+
+    // Per-tensor OpenCL data: device buffer handle and base offset.
+    auto * extra0 = static_cast<ggml_tensor_extra_cl *>(src0->extra);
+    auto * extrad = static_cast<ggml_tensor_extra_cl *>(dst->extra);
+
+    // Base offset plus the tensor's view offset.
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const size_t n_items = static_cast<size_t>(ggml_nelements(dst));
+    size_t global_work_size[3] = {n_items, 1, 1};
+    size_t local_work_size[3]  = {64, 1, 1};
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Launches kernel_clamp with one work-item per element of dst (src1 is ignored).
+// The bounds are read as floats from the first two 32-bit slots of dst->op_params.
+static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    UNUSED(src1);
+
+    auto * ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    cl_command_queue queue = ctx->queue;
+    cl_kernel kernel = ctx->kernel_clamp;
+
+    auto * extra0 = static_cast<ggml_tensor_extra_cl *>(src0->extra);
+    auto * extrad = static_cast<ggml_tensor_extra_cl *>(dst->extra);
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    // memcpy copies the raw 32-bit parameter slots into the float bounds.
+    const int32_t * params = (const int32_t *) dst->op_params;
+    float min;
+    float max;
+    memcpy(&min, params + 0, sizeof(float));
+    memcpy(&max, params + 1, sizeof(float));
+
+    const size_t n_items = static_cast<size_t>(ggml_nelements(dst));
+    size_t global_work_size[3] = {n_items, 1, 1};
+    size_t local_work_size[3]  = {64, 1, 1};
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &min));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &max));
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Launches kernel_norm with one work-group per row of src0. Each group has
+// nth = min(64, ne00) work-items and gets nth floats of local memory (arg 7).
+// eps is read from dst->op_params; src1 is ignored.
+static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    UNUSED(src1);
+
+    auto * ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    cl_command_queue queue = ctx->queue;
+    cl_kernel kernel = ctx->kernel_norm;
+
+    auto * extra0 = static_cast<ggml_tensor_extra_cl *>(src0->extra);
+    auto * extrad = static_cast<ggml_tensor_extra_cl *>(dst->extra);
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    // src0 was asserted non-null above, so its shape is read directly.
+    const int ne00 = src0->ne[0];
+    const cl_ulong nb01 = src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+    // Work-items per row, capped at 64; this is also the work-group size.
+    const int nth = MIN(64, ne00);
+    const int64_t n_rows = ggml_nrows(src0);
+    size_t global_work_size[3] = {static_cast<size_t>(n_rows) * nth, 1, 1};
+    size_t local_work_size[3]  = {static_cast<size_t>(nth), 1, 1};
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),     &eps));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+// Launches kernel_rms_norm with one work-group of nth = min(64, ne00)
+// work-items per row of src0. Requires ne00 to be a multiple of 4. The local
+// memory argument is sized from the sub-group size the device reports for this
+// work-group shape. eps is read from dst->op_params; src1 is ignored.
+static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    UNUSED(src1);
+
+    auto * ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    auto * dev_ctx =
+        static_cast<ggml_backend_opencl_device_context *>(backend->device->context);
+    cl_command_queue queue = ctx->queue;
+    cl_kernel kernel = ctx->kernel_rms_norm;
+
+    auto * extra0 = static_cast<ggml_tensor_extra_cl *>(src0->extra);
+    auto * extrad = static_cast<ggml_tensor_extra_cl *>(dst->extra);
+
+    // Base offset plus the tensor's view offset.
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    const int ne00 = src0->ne[0];
+    const cl_ulong nb01 = src0->nb[1];
+
+    GGML_ASSERT(ne00 % 4 == 0);
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+    // One work-group of nth work-items per row.
+    const int nth = MIN(64, ne00);
+    const int64_t n_rows = ggml_nrows(src0);
+    size_t global_work_size[3] = {static_cast<size_t>(n_rows) * nth, 1, 1};
+    size_t local_work_size[3]  = {static_cast<size_t>(nth), 1, 1};
+
+    // The kernel takes its local scratch buffer as an argument whose size
+    // depends on the sub-group size, so query that first.
+    // clGetKernelSubGroupInfo requires OpenCL 2.1 or newer.
+    size_t sgs;
+    CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
+        CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+        sizeof(local_work_size), local_work_size,
+        sizeof(size_t), &sgs, NULL));
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),     &eps));
+    // Local memory: one float per sub-group in the work-group.
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs,  NULL));
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+#endif
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+    const int  ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+    const int  ne10 = src1 ? src1->ne[0] : 0;
+    const int  ne11 = src1 ? src1->ne[1] : 0;
+    const int  ne12 = src1 ? src1->ne[2] : 0;
+    const int  ne13 = src1 ? src1->ne[3] : 0;
+
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+
+    const int  ne0 = dst ? dst->ne[0] : 0;
+    const int  ne1 = dst ? dst->ne[1] : 0;
+
+    int r2 = ne12/ne02;
+    int r3 = ne13/ne03;
+
+    GGML_ASSERT(ne00 == ne10);
+
+    int nth0 = 32;
+    int nth1 = 1;
+    int nrows = 1;
+    // The number of values produced by each subgroup
+    int ndst = 4;
+
+    cl_kernel kernel;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    cl_context context = backend_ctx->context;
+
+    if (ne01 && ne1 && use_adreno_kernels(src0)) {
+
+    // init CL objects
+    // <--------------------------------------------> //
+    cl_int              status;
+    cl_image_format     img_fmt_1d;
+    cl_image_desc       img_desc_1d;
+    cl_buffer_region    region;
+    cl_mem              A_image1d;
+    cl_mem              B_image1d;
+    cl_mem              B_sub_buffer;
+    cl_mem              C_d;
+    // for B transpose
+    cl_mem B_d;
+    cl_mem B_d_input_image;
+    // <--------------------------------------------> //
+
+    // define matrix dimensions
+    // <--------------------------------------------> //
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+    int padding;
+    // <--------------------------------------------> //
+
+    // q4_0 x fp32
+    if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
+        // TODO: remove duplicate definitions of image description + format -- move to top
+
+        // create an image for A
+        // <--------------------------------------------> //
+        if (N == 1) {
+            img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
+        } else {
+            img_fmt_1d = { CL_R, CL_FLOAT};
+        }
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 2 / 4;    // Divide by 4 for char -> float
+        img_desc_1d.buffer = extra0_q4_0->q;
+        A_image1d = clCreateImage(
+            context,
+            CL_MEM_READ_ONLY,
+            &img_fmt_1d,
+            &img_desc_1d,
+            NULL,
+            &status);
+        CL_CHECK(status);
+        // <--------------------------------------------> //
+
+
+        // create a sub_buffer for B
+        // <--------------------------------------------> //
+        region.origin = (extra1->offset);
+        region.size = K * N * sizeof(float);
+        B_sub_buffer = clCreateSubBuffer(
+            extra1->data_device,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &status);
+        CL_CHECK(status);
+        // <--------------------------------------------> //
+
+        // transpose activation for Skyler's gemm
+        if (N != 1) {
+            //how many extra elements beyond multiple of 8
+            int extra_elements = N % 8;
+
+            //how much padding to add
+            padding = 0;
+            if (extra_elements > 0){
+                padding = 8 - extra_elements;
+            }
+
+            // Specify the starting offset (in bytes)
+            region.origin = 0;
+            // Specify the size of the sub-buffer (divide by 2 for FP16)
+            region.size = K * (N + padding) * sizeof(float)/2;
+            B_d = clCreateSubBuffer(
+                backend_ctx->B_d_max,
+                0,
+                CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+                &status);
+            CL_CHECK(status);
+
+            cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
+            cl_image_desc image_desc_B_d_input = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(K * N / 4),
+                0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
+            };
+            B_d_input_image = clCreateImage(
+                context,
+                0,
+                &image_format_B_d_input,
+                &image_desc_B_d_input,
+                NULL,
+                &status);
+            CL_CHECK(status);
+
+            cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
+            cl_image_desc image_desc_B_d_output = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(K * (N + padding)/4),
+                0, 0, 0, 0, 0, 0, 0, { B_d }
+            };
+            B_image1d = clCreateImage(
+                context,
+                0,
+                &image_format_B_d_output,
+                &image_desc_B_d_output,
+                NULL,
+                &status);
+            CL_CHECK(status);
+
+            int height_B = N/4;
+            int width_B = K/4;
+            int padded_height_B = (N + padding)/4;
+
+            kernel = backend_ctx->kernel_transpose_32_16;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+            size_t local_size_t[2] = { 1, 16 };
+            //WGS tuning
+            if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=4;
+                local_size_t[1]=8;
+            } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=2;
+                local_size_t[1]=8;
+            } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+                local_size_t[0]=1;
+                local_size_t[1]=8;
+            } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=2;
+                local_size_t[1]=8;
+            }
+
+            size_t global_size_t[2] = {
+                static_cast<size_t>(width_B),
+                static_cast<size_t>(padded_height_B)
+            };
+
+            #ifdef GGML_OPENCL_PROFILING
+                cl_event evt;
+                CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
+
+                g_profiling_info.emplace_back();
+                populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
+            #else
+                CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
+            #endif
+        } else {
+            // no need to transpose B in other cases
+            // create an image for B from sub_buffer
+            // <--------------------------------------------> //
+            img_fmt_1d = {CL_RGBA, CL_FLOAT};
+
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_desc_1d.image_width = K * N / 4;
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.buffer = B_sub_buffer;
+            B_image1d = clCreateImage(
+                context,
+                CL_MEM_READ_ONLY,
+                &img_fmt_1d,
+                &img_desc_1d,
+                NULL,
+                &status);
+            CL_CHECK(status);
+            // <--------------------------------------------> //
+        }
+
+        // choose gemm or gemv kernel
+        // <--------------------------------------------> //
+        if (N == 1) {
+            kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+            if (M == 4096 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+            } else if (M == 4096 && K == 11008) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+            } else if (M == 11008 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+            } else if (M == 32000 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+            }
+        } else {
+            kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
+        }
+        // <--------------------------------------------> //
+
+        // set kernel args
+        // <--------------------------------------------> //
+        cl_uint k_arg = 0;
+
+        if (N == 1) {
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &A_image1d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &B_image1d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extra1->offset));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extrad->offset));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r3));
+        } else {
+            region.origin = extrad->offset; // Specify the starting offset (in bytes)
+            region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
+            C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+            CL_CHECK(status);
+
+            int padded_N = ne1 + padding;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); //A_q_dextra0_q4_0->q
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &ne01)); //M
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),    &padded_N)); //N with padding
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),    &ne00)); //K
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),    &ne1)); //N without padding
+        }
+        // <--------------------------------------------> //
+
+        // choose workgroup size
+        // <--------------------------------------------> //
+        size_t global_work_size[3] = {
+            64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
+        size_t local_work_size[3] = {64, 2, 4};
+
+        global_work_size[0] = (size_t)(ceil((float)ne1/8));
+        global_work_size[1] = (size_t)(ne01/4);
+        global_work_size[2] = (size_t)(1);
+
+        local_work_size[0]  = (size_t)(1); //4x32 for FP32
+        local_work_size[1]  = (size_t)(128);
+        local_work_size[2]  = (size_t)(1);
+
+        //WGS tuning
+        if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 1;
+            local_work_size[1] = 128;
+        } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        }
+
+        if (N == 1) {
+            local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+            local_work_size[1] = 4; // reduce factor
+            local_work_size[2] = 1;
+
+            global_work_size[0] = M / 2;
+            global_work_size[1] = 4; // reduce factor
+            global_work_size[2] = 1;
+        }
+        // <--------------------------------------------> //
+
+        // enqueue kernel with profiling
+        // <--------------------------------------------> //
+    #ifdef GGML_OPENCL_PROFILING
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        // enqueue kernel without profiling
+    #else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    #endif
+        // <--------------------------------------------> //
+
+        // deallocate sub buffers and images
+        // <--------------------------------------------> //
+        CL_CHECK(clReleaseMemObject(A_image1d));
+        CL_CHECK(clReleaseMemObject(B_sub_buffer));
+        CL_CHECK(clReleaseMemObject(B_image1d));
+
+        if (N != 1) {
+            CL_CHECK(clReleaseMemObject(B_d));
+            CL_CHECK(clReleaseMemObject(B_d_input_image));
+            CL_CHECK(clReleaseMemObject(C_d));
+        }
+        // <--------------------------------------------> //
+
+        return;
+    }
+    } // if (ne01 && ne1)
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    if (!ggml_is_transposed(src0) &&
+        !ggml_is_transposed(src1) &&
+        src1t == GGML_TYPE_F32 &&
+        ne00%32 == 0 &&
+        ne11 > 2) {
+#ifdef GGML_OPENCL_SOA_Q
+        // Set up kernel.
+        switch(src0t) {
+            case GGML_TYPE_Q4_0:
+                // This should have been satisfied.
+                GGML_ASSERT(ne11 == ne1);
+                GGML_ASSERT(ne01 == ne0);
+
+                if (backend_ctx->gpu_family == INTEL) {
+                    nth0 = 16;
+                    nth1 = 1;
+
+                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
+                } else if (backend_ctx->gpu_family == ADRENO) {
+                    nth0 = 64;
+                    nth1 = 1;
+
+                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
+                } else {
+                    GGML_ASSERT(false && "TODO: Unknown GPU");
+                }
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+                break;
+            default:
+                break;
+        }
+
+        // Launch kernel.
+        if (src0t == GGML_TYPE_Q4_0) {
+            size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+            if (backend_ctx->gpu_family == INTEL) {
+                // Set global size for Intel. It uses 16x output values.
+                global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
+                global_work_size[1] = (size_t)ne11*nth1;
+                global_work_size[2] = (size_t)ne12*ne13;
+            }
+
+#ifdef GGML_OPENCL_PROFILING
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+            g_profiling_info.emplace_back();
+            populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+            return;
+        }
+#else // GGML_OPENCL_SOA_Q
+        // TODO: add block_q4_0 variant.
+#endif // GGML_OPENCL_SOA_Q
+    }
+
+    // use custom matrix x vector kernel
+    switch (src0t) {
+        case GGML_TYPE_F32:
+            //GGML_ASSERT(ne02 == ne12);
+            GGML_ASSERT(src1t == GGML_TYPE_F32);
+            kernel = backend_ctx->kernel_mul_mat_f32_f32;
+            nrows = 4;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 32;
+                nth1 = 1;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
+            break;
+        case GGML_TYPE_F16:
+            //GGML_ASSERT(ne02 == ne12);
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 32;
+                nth1 = 1;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            if (src1t == GGML_TYPE_F32) {
+                if (ne11 * ne12 < 4) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+                    nrows = ne11;
+                } else {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32;
+                    nrows = 4;
+                }
+            } else {
+                kernel = backend_ctx->kernel_mul_mat_f16_f16;
+                nrows = 4;
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
+            break;
+        case GGML_TYPE_Q4_0:
+            // This should have been satisfied.
+            GGML_ASSERT(ne11 == ne1);
+            GGML_ASSERT(ne01 == ne0);
+
+#ifdef GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+                ndst = 8;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+                ndst =8;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+#else // GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
+                // group produces N_DST (4 for Q4_0 kernel) values in the result.
+                // The number of workgroups on dim 0 (the leading dimension) is
+                // the nearest multiple of 4 that covers ne0 (equals ne01).
+                nth0 = 16;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+#endif // GGML_OPENCL_SOA_Q
+            break;
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 2;
+                nth1 = 16;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 2;
+                nth1 = 64;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    if (src0t == GGML_TYPE_Q4_0 ||
+        src0t == GGML_TYPE_Q4_1 ||
+        src0t == GGML_TYPE_Q8_0 ||
+        src0t == GGML_TYPE_Q2_K) {
+        // Each SIMD group produces N_DST values in the result. Assuming each
+        // workgroup has N_SIMDGROUP SIMD groups, then each workgroup will
+        // produce N_DST*N_SIMDGROUP values in the result. Hence, the grid size
+        // (number of workgroups) will be a nearest multiple of
+        // N_DST*N_SIMDGROUP to cover the size of the dimension. Below, 4 is
+        // N_DST*N_SIMDGROUP (see the kernel for Q4_0 matmul).
+        size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else if (src0t == GGML_TYPE_Q4_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q3_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q5_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q6_K) {
+        size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        int64_t ny = (ne11 + nrows - 1)/nrows;
+
+        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+// Enqueue kernel_scale on the backend queue with src0 as input and dst as output.
+//
+// backend: OpenCL backend whose context supplies the command queue and kernel.
+// src0:    input tensor; must be contiguous and carry a device `extra`.
+// src1:    unused.
+// dst:     output tensor; the float scale factor is read from the start of
+//          its op_params.
+static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    // The factor is copied out of op_params byte-wise rather than through a
+    // pointer cast.
+    float factor;
+    memcpy(&factor, dst->op_params, sizeof(factor));
+
+    ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Position of each tensor inside its device buffer: the offset recorded in
+    // `extra` plus the tensor's own view offset.
+    cl_ulong src_offset = src_extra->offset + src0->view_offs;
+    cl_ulong dst_offset = dst_extra->offset + dst->view_offs;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue  = backend_ctx->queue;
+    cl_kernel        kernel = backend_ctx->kernel_scale;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &src_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &src_offset));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &dst_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &dst_offset));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &factor));
+
+    // One work-item per 4 destination elements.
+    // NOTE(review): presumably the kernel handles 4 floats per work-item;
+    // element counts that are not a multiple of 4 are truncated here - confirm
+    // against the kernel and the supports_op checks.
+    const int n_items = ggml_nelements(dst)/4;
+
+    size_t global_work_size[] = {(size_t)n_items, 1, 1};
+    size_t local_work_size[]  = {64, 1, 1};
+
+    // An event is only requested when profiling is compiled in.
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    cl_event * evt_ptr = &evt;
+#else
+    cl_event * evt_ptr = NULL;
+#endif
+
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, evt_ptr));
+
+#ifdef GGML_OPENCL_PROFILING
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#endif
+}
+
+// Enqueue one of the kernel_cpy_* kernels to copy/convert src0 into src1.
+//
+// backend: OpenCL backend whose context supplies the command queue and kernels.
+// src0:    source tensor (F32 or F16) with a device `extra`.
+// src1:    destination tensor (F32 or F16) with a device `extra`.
+// dst:     unused; GGML_OP_CPY happens between src0 and src1, while
+//          GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
+//
+// Any other type combination trips GGML_ASSERT with "not implemented".
+static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+
+    UNUSED(dst);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    // Source shape and strides. Both tensors were asserted non-null above, so
+    // the fields are read directly.
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    // Destination shape and strides.
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const enum ggml_type src0t = src0->type;
+    const enum ggml_type src1t = src1->type;
+
+    ggml_tensor_extra_cl * from_extra = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * to_extra   = (ggml_tensor_extra_cl *)src1->extra;
+
+    // Position of each tensor inside its device buffer.
+    cl_ulong from_offset = from_extra->offset + src0->view_offs;
+    cl_ulong to_offset   = to_extra->offset   + src1->view_offs;
+
+    // Pick the kernel matching the (source type, destination type) pair.
+    cl_kernel kernel;
+
+    if (src0t == GGML_TYPE_F32 && src1t == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_cpy_f32_f16;
+    } else if (src0t == GGML_TYPE_F32 && src1t == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_cpy_f32_f32;
+    } else if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_cpy_f16_f16;
+    } else if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_cpy_f16_f32;
+    } else {
+        GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &from_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &from_offset));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &to_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &to_offset));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+
+    // One work-group of `nth` work-items per source row (ne01 groups along
+    // dimension 0); dimensions 1 and 2 of the grid follow ne02 and ne03.
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[]  = {(size_t)nth, 1, 1};
+
+    // An event is only requested when profiling is compiled in.
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    cl_event * evt_ptr = &evt;
+#else
+    cl_event * evt_ptr = NULL;
+#endif
+
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, evt_ptr));
+
+#ifdef GGML_OPENCL_PROFILING
+    // The profiling record is attached to src1, the tensor being written.
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
+#endif
+}
+
+// GGML_OP_DUP / GGML_OP_CONT: copy src0 into dst by reusing ggml_cl_cpy.
+// ggml_cl_cpy writes into its `src1` argument, so dst is passed in that slot;
+// its own `dst` parameter is unused and receives nullptr. src1 is unused here.
+static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    UNUSED(src1);
+    ggml_cl_cpy(backend, src0, dst, nullptr);
+}
+
+// Enqueue the diag-mask-inf kernel with src0 as input and dst as output.
+//
+// backend: OpenCL backend whose context supplies the command queue and kernels.
+// src0:    input tensor with a device `extra`.
+// src1:    unused.
+// dst:     output tensor with a device `extra`; op_params[0] holds n_past.
+//
+// When ne00 is a multiple of 8, kernel_diag_mask_inf_8 is launched on a 1D
+// grid of ne00*ne01*ne02/8 work-items; otherwise kernel_diag_mask_inf is
+// launched on an ne00 x ne01 x ne02 grid.
+static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    const int n_past = ((int32_t *)(dst->op_params))[0];
+
+    // src0 was asserted non-null above, so its shape is read directly.
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * src_extra = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * dst_extra = (ggml_tensor_extra_cl *)dst->extra;
+
+    // Position of each tensor inside its device buffer.
+    cl_ulong src_offset = src_extra->offset + src0->view_offs;
+    cl_ulong dst_offset = dst_extra->offset + dst->view_offs;
+
+    // Both kernels take the same argument list; only the kernel object and the
+    // launch grid differ.
+    const bool use_vec8 = (ne00%8 == 0);
+
+    cl_kernel kernel = use_vec8 ? backend_ctx->kernel_diag_mask_inf_8
+                                : backend_ctx->kernel_diag_mask_inf;
+
+    size_t global_work_size[3];
+    if (use_vec8) {
+        global_work_size[0] = (size_t)ne00*ne01*ne02/8;
+        global_work_size[1] = 1;
+        global_work_size[2] = 1;
+    } else {
+        global_work_size[0] = (size_t)ne00;
+        global_work_size[1] = (size_t)ne01;
+        global_work_size[2] = (size_t)ne02;
+    }
+
+    // NOTE(review): the global size is not rounded up to a multiple of the
+    // local size 64 (in the scalar path ne00 is never a multiple of 64) -
+    // confirm the target runtimes accept non-uniform work-groups.
+    size_t local_work_size[] = {64, 1, 1};
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &src_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &src_offset));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &dst_extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &dst_offset));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &n_past));
+
+    // An event is only requested when profiling is compiled in.
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    cl_event * evt_ptr = &evt;
+#else
+    cl_event * evt_ptr = NULL;
+#endif
+
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, evt_ptr));
+
+#ifdef GGML_OPENCL_PROFILING
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#endif
+}
+
+// Enqueue the soft-max kernel with src0 as input and dst as output.
+//
+// Softmax can fuse the KQ mask and KQ scale, which used to be two additional
+// ops before softmax. It also fuses alibi if `max_bias > 0`. For llama, alibi
+// is not used; however, for some other models, it is used.
+//
+// backend: OpenCL backend whose context supplies the command queue and kernels.
+// src0:    input tensor with a device `extra`.
+// src1:    optional KQ mask; when present it must carry a device `extra`.
+// dst:     output tensor with a device `extra`; op_params[0] is the scale and
+//          op_params[1] is max_bias, both stored as floats.
+static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    // KQ_mask
+    if (src1) {
+        GGML_ASSERT(src1->extra);
+    }
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    // Without a mask, src0's buffer and offset are passed in the mask slots so
+    // that every kernel argument is still set.
+    cl_ulong offset1   = extra1 ? extra1->offset + src1->view_offs : offset0;
+    cl_mem * mask_data = extra1 ? &extra1->data_device : &extra0->data_device;
+
+    // src0 was asserted non-null above, so its shape is read directly.
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    float scale, max_bias;
+    memcpy(&scale,    dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, dst->op_params + 1, sizeof(float));
+
+    // Head count is derived as (total rows) / (rows per matrix); n_head_log2
+    // is the largest power of two not exceeding it.
+    const int nrows_x = ggml_nrows(src0);
+    const int nrows_y = src0->ne[1];
+
+    const int n_head      = nrows_x/nrows_y;
+    const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    // Local size must be wave size. Each workgroup is a wave, working on a row,
+    // where a row corresponds to leading dimension. Intel keeps the default of
+    // min(32, ne00); Adreno uses 64; any other family is rejected.
+    int nth = MIN(32, ne00);
+
+    if (backend_ctx->gpu_family == ADRENO) {
+        nth = 64;
+    } else if (backend_ctx->gpu_family != INTEL) {
+        GGML_ASSERT(false && "TODO: Unknown GPU");
+    }
+
+    // The _4 kernel is selected when the row length is a multiple of 4.
+    cl_kernel kernel = (ne00%4 == 0) ? backend_ctx->kernel_soft_max_4
+                                     : backend_ctx->kernel_soft_max;
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   mask_data));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(float),    &scale));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float),    &max_bias));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &m0));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),    &m1));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &n_head_log2));
+
+    // One work-group of `nth` work-items per row.
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[]  = {(size_t)nth, 1, 1};
+
+    // An event is only requested when profiling is compiled in.
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    cl_event * evt_ptr = &evt;
+#else
+    cl_event * evt_ptr = NULL;
+#endif
+
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, evt_ptr));
+
+#ifdef GGML_OPENCL_PROFILING
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#endif
+}
+
+// Enqueue one of the kernel_rope_{norm,neox}_{f32,f16} kernels.
+//
+// backend: OpenCL backend whose context supplies the command queue and kernels.
+// src0:    input tensor (F32 or F16) with a device `extra`.
+// src1:    second input with a device `extra`; its ne[0] must equal src0's
+//          ne[2].
+// dst:     output tensor with a device `extra`. dst->src[2] is an optional
+//          third input. op_params holds, by int32 slot: [0] n_past,
+//          [1] n_dims, [2] mode, [4] n_ctx_orig, and [5..10] the floats
+//          freq_base, freq_scale, ext_factor, attn_factor, beta_fast,
+//          beta_slow.
+//
+// Bit 1 of `mode` (mode & 2) selects the neox kernels; otherwise the norm
+// kernels are used. Any src0 type other than F32/F16 trips GGML_ASSERT.
+static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    // Optional third input. When it is absent (or has no `extra`), src0's
+    // buffer and offset are passed in its argument slots so that every kernel
+    // argument is still set.
+    ggml_tensor * src2 = dst->src[2];
+    ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
+
+    cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
+
+    const int  ne00 = src0->ne[0];
+    const int  ne01 = src0->ne[1];
+    const int  ne02 = src0->ne[2];
+    const int  ne03 = src0->ne[3];
+
+    // The strides are handed to clSetKernelArg with sizeof(cl_ulong), so the
+    // host variables must really be cl_ulong: pointing at a 4-byte int would
+    // make the runtime read 8 bytes from a 4-byte object.
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int  ne10 = src1->ne[0];
+
+    const int  ne0 = dst->ne[0];
+    const int  ne1 = dst->ne[1];
+    const int  ne2 = dst->ne[2];
+    const int  ne3 = dst->ne[3];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    GGML_ASSERT(ne10 == ne02);
+
+    int nth = MIN(64, ne00);
+
+    const int n_past     = ((int *) dst->op_params)[0];
+    const int n_dims     = ((int *) dst->op_params)[1];
+    const int mode       = ((int *) dst->op_params)[2];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    // The float parameters are copied out byte-wise from their int32 slots.
+    memcpy(&freq_base,   (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const bool is_neox = mode & 2;
+
+    cl_kernel kernel;
+
+    if (!is_neox) {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_norm_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_norm_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        };
+    } else {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_neox_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_neox_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        };
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   extra2 ? &extra2->data_device : &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
+    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &n_past));
+    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &n_dims));
+    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &n_ctx_orig));
+    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float),    &freq_base));
+    CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float),    &freq_scale));
+    CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float),    &ext_factor));
+    CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float),    &attn_factor));
+    CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float),    &beta_fast));
+    CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float),    &beta_slow));
+
+    // One work-group of `nth` work-items per src0 row.
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Op offloading
+//------------------------------------------------------------------------------
+
+// Common signature of the per-op ggml_cl_* functions selected by
+// ggml_cl_compute_forward: backend, first source, second source, destination.
+typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+// Dispatch one graph node to its OpenCL implementation.
+//
+// Returns true after the matching ggml_cl_* function has been called for
+// `tensor`. Returns false, without calling anything, when the op is not
+// handled here: an unlisted op or unary op, an op for which neither the node
+// nor its first two sources carries `extra`, or a MUL_MAT in that same
+// situation that is additionally rejected by ggml_cl_can_mul_mat().
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+
+    // True when the node or either of its first two sources carries `extra`.
+    const bool any_on_device = tensor->extra
+        || (src0 != nullptr && src0->extra)
+        || (src1 != nullptr && src1->extra);
+
+    // Every op except MUL_MAT is declined outright when nothing is on the
+    // device; MUL_MAT applies its own, weaker gate inside the switch.
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
+    ggml_cl_func_t func = nullptr;
+
+    switch (tensor->op) {
+        case GGML_OP_GET_ROWS:
+            func = ggml_cl_get_rows;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_cl_cpy;
+            break;
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+            func = ggml_cl_dup;
+            break;
+        case GGML_OP_ADD:
+            GGML_ASSERT(ggml_is_contiguous(src0));
+            GGML_ASSERT(ggml_is_contiguous(src1));
+            func = ggml_cl_add;
+            break;
+        case GGML_OP_MUL:
+            func = ggml_cl_mul;
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_cl_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_cl_silu;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cl_relu;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_cl_clamp;
+            break;
+        case GGML_OP_NORM:
+            func = ggml_cl_norm;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_cl_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT:
+            // Proceeds if anything is on the device, or if
+            // ggml_cl_can_mul_mat() accepts the operands.
+            if (!any_on_device && !ggml_cl_can_mul_mat(src0, src1, tensor)) {
+                return false;
+            }
+            func = ggml_cl_mul_mat;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_cl_scale;
+            break;
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_cl_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_cl_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_cl_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_cl_rope;
+            break;
+        default:
+            return false;
+    }
+
+    func(backend, src0, src1, tensor);
+    return true;
+}
diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py
new file mode 100644
index 00000000000..b5d1d7242b6
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py
@@ -0,0 +1,26 @@
+#
+
+import sys
+import logging
+logger = logging.getLogger("opencl-embed-kernel")
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    if len(sys.argv) != 3:
+        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
+        sys.exit(1)
+
+    ifile = open(sys.argv[1], "r")
+    ofile = open(sys.argv[2], "w")
+
+    for i in ifile:
+        ofile.write('R"({})"\n'.format(i))
+
+    ifile.close()
+    ofile.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
new file mode 100644
index 00000000000..d1cdf709bab
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
@@ -0,0 +1,2683 @@
+#ifdef cl_khr_fp16 // half support is required: compilation stops below when no fp16 extension is defined
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supported by OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups // sub-group built-ins (e.g. sub_group_reduce_add) are used by kernels in this file
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use a subgroup size of 32 on Intel (an attribute for 16 is defined as well).
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use a subgroup size of 64 on Adreno ("half"); "full" is exposed as 128.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32  // sizes block_q4_0.qs as QK4_0/2 bytes
+#define QR4_0                   2
+#define QK4_1                   32  // sizes block_q4_1.qs as QK4_1/2 bytes
+#define QR4_1                   2
+#define QK5_0                   32  // sizes block_q5_0.qs as QK5_0/2 bytes
+#define QR5_0                   2
+#define QK5_1                   32  // sizes block_q5_1.qs as QK5_1/2 bytes
+#define QR5_1                   2
+#define QK8_0                   32  // sizes block_q8_0.qs as QK8_0 bytes
+#define QR8_0                   1
+#define QK_K                    256 // NOTE(review): not referenced in the visible part of this file
+#define K_QUANTS_PER_ITERATION  2   // NOTE(review): not referenced in the visible part of this file
+
+typedef char int8_t;     // <stdint.h>-style names mapped onto the OpenCL C built-in integer types
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0 // one quantized block, as decoded by dequantize_q4_0_f32 / dequantize_q4_0_f16
+{
+    half d;                // scale: a 4-bit code q decodes to d*q - 8*d
+    uint8_t qs[QK4_0 / 2]; // 16 bytes; the low and high nibble of each byte are separate 4-bit codes
+};
+
+//------------------------------------------------------------------------------
+// block_q4_1
+//------------------------------------------------------------------------------
+struct block_q4_1 // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    half m;
+    uint8_t qs[QK4_1 / 2]; // 16 bytes
+};
+
+//------------------------------------------------------------------------------
+// block_q5_0
+//------------------------------------------------------------------------------
+struct block_q5_0 // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    uint32_t qh; // NOTE(review): a 4-byte member right after a 2-byte half is normally preceded by 2 bytes of padding — confirm the host layout matches
+    uint8_t qs[QK5_0 / 2]; // 16 bytes
+};
+
+//------------------------------------------------------------------------------
+// block_q5_1
+//------------------------------------------------------------------------------
+struct block_q5_1 // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2]; // 16 bytes
+};
+
+//------------------------------------------------------------------------------
+// block_q8_0
+//------------------------------------------------------------------------------
+struct block_q8_0 // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    int8_t qs[QK8_0]; // 32 signed bytes
+};
+
+//------------------------------------------------------------------------------
+// block_q2_K
+//------------------------------------------------------------------------------
+struct block_q2_K // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    uint8_t scales[16];
+    uint8_t qs[64];
+    half d;
+    half dmin;
+};
+
+//------------------------------------------------------------------------------
+// block_q3_K
+//------------------------------------------------------------------------------
+struct block_q3_K // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    uint8_t hmask[32];
+    uint8_t qs[64];
+    uint8_t scales[12];
+    half d;
+};
+
+//------------------------------------------------------------------------------
+// block_q4_K
+//------------------------------------------------------------------------------
+struct block_q4_K // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_K
+//------------------------------------------------------------------------------
+struct block_q5_K // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qh[32];
+    uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+struct block_q6_K // NOTE(review): not referenced by the kernels visible here; confirm layout against the host-side block
+{
+    uint8_t ql[128];
+    uint8_t qh[64];
+    int8_t scales[16];
+    half d;
+};
+
+//------------------------------------------------------------------------------
+// dequantize_q4_0_f32, dequantize_q4_0_f16
+//------------------------------------------------------------------------------
+void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
+    global ushort * words = ((global ushort *)xb + 1); // the codes start one ushort (the half `d`) into the block
+    float lo_scale = il ? (xb->d / 16.h) : xb->d;      // il != 0 picks the high nibbles; /16 undoes their 4-bit shift
+    float hi_scale = lo_scale / 256.f;                 // the second byte of each word sits 8 bits higher
+    float shift = -8.h * xb->d;                        // constant term added to every decoded value
+    ushort lo_mask = il ? 0x00F0 : 0x000F;
+    ushort hi_mask = lo_mask << 8;
+    // word j fills lane 2j from its low byte and lane 2j+1 from its high byte
+    reg->s0 = lo_scale * (words[0] & lo_mask) + shift;
+    reg->s1 = hi_scale * (words[0] & hi_mask) + shift;
+
+    reg->s2 = lo_scale * (words[1] & lo_mask) + shift;
+    reg->s3 = hi_scale * (words[1] & hi_mask) + shift;
+
+    reg->s4 = lo_scale * (words[2] & lo_mask) + shift;
+    reg->s5 = hi_scale * (words[2] & hi_mask) + shift;
+
+    reg->s6 = lo_scale * (words[3] & lo_mask) + shift;
+    reg->s7 = hi_scale * (words[3] & hi_mask) + shift;
+
+    reg->s8 = lo_scale * (words[4] & lo_mask) + shift;
+    reg->s9 = hi_scale * (words[4] & hi_mask) + shift;
+
+    reg->sa = lo_scale * (words[5] & lo_mask) + shift;
+    reg->sb = hi_scale * (words[5] & hi_mask) + shift;
+
+    reg->sc = lo_scale * (words[6] & lo_mask) + shift;
+    reg->sd = hi_scale * (words[6] & hi_mask) + shift;
+
+    reg->se = lo_scale * (words[7] & lo_mask) + shift;
+    reg->sf = hi_scale * (words[7] & hi_mask) + shift;
+}
+
+void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg) {
+    global ushort * words = ((global ushort *)xb + 1); // the codes start one ushort (the half `d`) into the block
+    half lo_scale = il ? (xb->d / 16.h) : xb->d;       // il != 0 picks the high nibbles; /16 undoes their 4-bit shift
+    half hi_scale = lo_scale / 256.h;                  // the second byte of each word sits 8 bits higher
+    half shift = -8.h * xb->d;                         // constant term added to every decoded value
+    ushort lo_mask = il ? 0x00F0 : 0x000F;
+    ushort hi_mask = lo_mask << 8;
+    // word j fills lane 2j from its low byte and lane 2j+1 from its high byte
+    reg->s0 = lo_scale * (words[0] & lo_mask) + shift;
+    reg->s1 = hi_scale * (words[0] & hi_mask) + shift;
+
+    reg->s2 = lo_scale * (words[1] & lo_mask) + shift;
+    reg->s3 = hi_scale * (words[1] & hi_mask) + shift;
+
+    reg->s4 = lo_scale * (words[2] & lo_mask) + shift;
+    reg->s5 = hi_scale * (words[2] & hi_mask) + shift;
+
+    reg->s6 = lo_scale * (words[3] & lo_mask) + shift;
+    reg->s7 = hi_scale * (words[3] & hi_mask) + shift;
+
+    reg->s8 = lo_scale * (words[4] & lo_mask) + shift;
+    reg->s9 = hi_scale * (words[4] & hi_mask) + shift;
+
+    reg->sa = lo_scale * (words[5] & lo_mask) + shift;
+    reg->sb = hi_scale * (words[5] & hi_mask) + shift;
+
+    reg->sc = lo_scale * (words[6] & lo_mask) + shift;
+    reg->sd = hi_scale * (words[6] & hi_mask) + shift;
+
+    reg->se = lo_scale * (words[7] & lo_mask) + shift;
+    reg->sf = hi_scale * (words[7] & hi_mask) + shift;
+}
+
+//------------------------------------------------------------------------------
+// add
+//------------------------------------------------------------------------------
+
+// General-purpose float addition, dst = src0 + src1, addressed through byte strides (nb*).
+// Works for non-contiguous tensors; src1 indices wrap modulo ne10..ne13, which broadcasts it over src0.
+// The price is that it is not very efficient.
+kernel void kernel_add(
+        global char * src0,
+        ulong  offset0,
+        global char * src1,
+        ulong  offset1,
+        global char * dst,
+        ulong  offsetd,
+        int   ne00,
+        int   ne01,
+        int   ne02,
+        int   ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int   ne10,
+        int   ne11,
+        int   ne12,
+        int   ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int   ne0,
+        int   ne1,
+        int   ne2,
+        int   ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    // one work-group per row: the group id supplies the indices of dims 1, 2 and 3
+    const int row   = get_group_id(0);
+    const int plane = get_group_id(1);
+    const int batch = get_group_id(2);
+
+    // byte addresses of the rows involved; the src1 indices wrap to broadcast
+    global char * lhs_row = src0 + batch*nb03 + plane*nb02 + row*nb01;
+    global char * rhs_row = src1 + (batch % ne13)*nb13 + (plane % ne12)*nb12 + (row % ne11)*nb11;
+    global char * out_row = dst  + batch*nb3  + plane*nb2  + row*nb1;
+
+    // the work-items of the group stride over the elements of the row
+    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
+        const float lhs = *((global float *)(lhs_row + col*nb00));
+        const float rhs = *((global float *)(rhs_row + (col % ne10)*nb10));
+        *((global float *)(out_row + col*nb0)) = lhs + rhs;
+    }
+}
+
+// Adds src1, assumed to be a single row of `ne` float4 values, to src0.
+// The row is broadcast: work-item gid pairs src0[gid] with src1[gid % ne].
+kernel void kernel_add_row(
+        global float4 * src0,
+        ulong  offset0,
+        global float4 * src1,
+        ulong  offset1,
+        global float4 * dst,
+        ulong  offsetd,
+        int ne
+) {
+    global float4 * lhs = (global float4*)((global char*)src0 + offset0);
+    global float4 * row = (global float4*)((global char*)src1 + offset1);
+    global float4 * out = (global float4*)((global char*)dst + offsetd);
+
+    // gid % ne, written as a divide and a multiply because this performs better than using %.
+    const uint gid = get_global_id(0);
+    const uint col = gid - (gid/ne)*ne;
+    out[gid] = lhs[gid] + row[col];
+}
+
+//------------------------------------------------------------------------------
+// mul
+//------------------------------------------------------------------------------
+kernel void kernel_mul( // float dst = src0 * src1 through byte strides; src1 is broadcast by index wrap-around
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 += offset0;
+    src1 += offset1;
+    dst  += offsetd;
+
+    // one work-group per row: the group id supplies the indices of dims 1, 2 and 3
+    const int row   = get_group_id(0);
+    const int plane = get_group_id(1);
+    const int batch = get_group_id(2);
+
+    // byte addresses of the rows involved; the src1 indices wrap to broadcast
+    global char * lhs_row = src0 + batch*nb03 + plane*nb02 + row*nb01;
+    global char * rhs_row = src1 + (batch % ne13)*nb13 + (plane % ne12)*nb12 + (row % ne11)*nb11;
+    global char * out_row = dst  + batch*nb3  + plane*nb2  + row*nb1;
+
+    // the work-items of the group stride over the elements of the row
+    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
+        const float lhs = *((global float *)(lhs_row + col*nb00));
+        const float rhs = *((global float *)(rhs_row + (col % ne10)*nb10));
+        *((global float *)(out_row + col*nb0)) = lhs * rhs;
+    }
+}
+
+// Multiplies src0 by src1, assumed to be a single row of `ne` float4 values.
+// The row is broadcast: work-item gid pairs src0[gid] with src1[gid % ne].
+kernel void kernel_mul_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    global float4 * lhs = (global float4*)((global char*)src0 + offset0);
+    global float4 * row = (global float4*)((global char*)src1 + offset1);
+    global float4 * out = (global float4*)((global char*)dst + offsetd);
+
+    // gid % ne, written as a divide and a multiply because this performs better than using %.
+    const uint gid = get_global_id(0);
+    const uint col = gid - (gid/ne)*ne;
+    out[gid] = lhs[gid] * row[col];
+}
+
+//------------------------------------------------------------------------------
+// scale
+//------------------------------------------------------------------------------
+kernel void kernel_scale( // dst = src0 * scale, one float4 per work-item
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd,
+        float scale
+) {
+    global float4 * in  = (global float4*)((global char*)src0 + offset0);
+    global float4 * out = (global float4*)((global char*)dst + offsetd);
+    out[get_global_id(0)] = in[get_global_id(0)] * scale;
+}
+
+//------------------------------------------------------------------------------
+// gelu
+//------------------------------------------------------------------------------
+#define GELU_COEF_A     0.044715f
+#define SQRT_2_OVER_PI  0.79788456080286535587989211986876f
+
+kernel void kernel_gelu( // tanh-based GELU, one float per work-item
+    global float * src0,
+    ulong offset0,
+    global float * dst,
+    ulong offsetd
+) {
+    global float * in  = (global float*)((global char*)src0 + offset0);
+    global float * out = (global float*)((global char*)dst + offsetd);
+
+    const size_t gid = get_global_id(0);
+    const float x = in[gid];
+    out[gid] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_4( // tanh-based GELU, one float4 per work-item
+    global float4 * src0,
+    ulong offset0,
+    global float4 * dst,
+    ulong offsetd
+) {
+    global float4 * in  = (global float4*)((global char*)src0 + offset0);
+    global float4 * out = (global float4*)((global char*)dst + offsetd);
+
+    const size_t gid = get_global_id(0);
+    const float4 x = in[gid];
+    out[gid] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+//------------------------------------------------------------------------------
+// silu
+//------------------------------------------------------------------------------
+kernel void kernel_silu( // dst = x / (1 + exp(-x)), one float per work-item
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    global float * in  = (global float*)((global char*)src0 + offset0);
+    global float * out = (global float*)((global char*)dst + offsetd);
+    const size_t gid = get_global_id(0);
+    const float x = in[gid];
+    out[gid] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_silu_4( // dst = x / (1 + exp(-x)), one float4 per work-item
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd
+) {
+    global float4 * in  = (global float4*)((global char*)src0 + offset0);
+    global float4 * out = (global float4*)((global char*)dst + offsetd);
+    const size_t gid = get_global_id(0);
+    const float4 x = in[gid];
+    out[gid] = x / (1.0f + exp(-x));
+}
+
+//------------------------------------------------------------------------------
+// relu
+//------------------------------------------------------------------------------
+kernel void kernel_relu( // dst = fmax(0, src0), one float per work-item
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    global float * in  = (global float*)((global char*)src0 + offset0);
+    global float * out = (global float*)((global char*)dst + offsetd);
+    const size_t gid = get_global_id(0);
+    out[gid] = fmax(0.0f, in[gid]);
+}
+
+//------------------------------------------------------------------------------
+// clamp
+//------------------------------------------------------------------------------
+kernel void kernel_clamp( // limits each float of src0 to [min, max], one float per work-item
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        float min,
+        float max
+) {
+    global float * in  = (global float*)((global char*)src0 + offset0);
+    global float * out = (global float*)((global char*)dst + offsetd);
+
+    const size_t gid = get_global_id(0);
+    const float x = in[gid];
+    out[gid] = x < min ? min : (x > max ? max : x);
+}
+
+//------------------------------------------------------------------------------
+// norm
+//------------------------------------------------------------------------------
+kernel void kernel_norm( // per work-group: y = (x - mean) / sqrt(variance + eps) over one row of ne00 floats
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        float eps,
+        local float * sum // scratch indexed by local id, i.e. one float per work-item
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global void*)((global char*)dst + offsetd);
+
+    global float * x = (global float *) ((global char *) src0 + get_group_id(0)*nb01); // input row of this work-group, nb01 bytes per row
+
+    // MEAN
+    // parallel sum: every work-item accumulates a strided slice of the row into its own slot
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        sum[get_local_id(0)] += x[i00];
+    }
+    // reduce: pairwise tree in local memory. NOTE(review): only folds every slot when the work-group size is a power of two — confirm with the host launch
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float mean  = sum[0] / ne00;
+
+    // recenter and VARIANCE
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item must have read sum[0] before the slots are reused
+    global float * y = dst + get_group_id(0)*ne00; // output rows are addressed densely, ne00 floats apart
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = x[i00] - mean;
+        sum[get_local_id(0)] += y[i00] * y[i00];
+    }
+
+    // reduce: same tree as above, now over the squared deviations
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float variance = sum[0] / ne00;
+
+    float scale = 1.0f/sqrt(variance + eps);
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = y[i00] * scale; // each work-item rescales the elements it centered above
+    }
+}
+
+//------------------------------------------------------------------------------
+// rms_norm
+//------------------------------------------------------------------------------
+// RMS norm of one row per work-group: y = x / sqrt(mean(x^2) + eps). This kernel depends on subgroup size.
+kernel void kernel_rms_norm(
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        float eps,
+        local float * sum // Note, the size depends on number of subgroups
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    global float4 * x = (global float4 *) ((global char *) src0 + get_group_id(0)*nb01);
+    global float * x_scalar = (global float *) x;
+    float4 sumf = 0;
+    float all_sum = 0;
+
+    // parallel sum of squares over the float4 part of the row
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        sumf += x[i00] * x[i00];
+    }
+    all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
+    all_sum = sub_group_reduce_add(all_sum);
+    if (get_sub_group_local_id() == 0) {
+        sum[get_sub_group_id()] = all_sum; // one partial sum per subgroup
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // fold the per-subgroup partials into sum[0]; the barrier orders each halving step across subgroups
+    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+       if (get_local_id(0) < i) {
+           sum[get_local_id(0)] += sum[get_local_id(0) + i];
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (get_local_id(0) == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i] * x_scalar[i]; // the ne00 % 4 tail contributes its squares too
+        }
+        sum[0] /= ne00;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const float mean  = sum[0]; // mean of the squares
+    const float scale = 1.0f/sqrt(mean + eps);
+
+    global float4 * y = (global float4 *) (dst + get_group_id(0)*ne00);
+    global float * y_scalar = (global float *) y;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        y[i00] = x[i00] * scale;
+    }
+    if (get_local_id(0) == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// diag_mask_inf kernels
+//------------------------------------------------------------------------------
+kernel void kernel_diag_mask_inf( // copies src0 to dst, writing -INFINITY where column > n_past + row
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int n_past
+) {
+    global float * in  = (global float*)((global char*)src0 + offset0);
+    global float * out = (global float*)((global char*)dst + offsetd);
+
+    const int plane = get_global_id(2);
+    const int row   = get_global_id(1);
+    const int col   = get_global_id(0);
+    const int idx   = plane*ne01*ne00 + row*ne00 + col;
+    if (col <= n_past + row) {
+        out[idx] = in[idx];
+    } else {
+        out[idx] = -INFINITY;
+    }
+}
+
+kernel void kernel_diag_mask_inf_8( // each work-item copies 8 floats (two float4) and masks those with column > n_past + row
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int n_past
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    int i = 2*get_global_id(0);
+    // copy both vectors first; masked lanes are overwritten below
+    dst[i+0] = src0[i+0];
+    dst[i+1] = src0[i+1];
+    int i4 = 4*i; // flat float index of the first lane, split into (i02, i01, i00) next
+    int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+    int i01 = i4/(ne00);      i4 -= i01*ne00;
+    int i00 = i4;
+    for (int k = 3; k >= 0; --k) {
+        if (i00 + 4 + k <= n_past + i01) {
+            break; // lane k of the second vector is kept, so every lower lane of both vectors is kept too
+        }
+        ((global float *)&dst[i+1])[k] = -INFINITY; // scalar view: [k] must select lane k, not the float4 k elements further on
+        if (i00 + k > n_past + i01) {
+            ((global float *)&dst[i])[k] = -INFINITY;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// softmax
+//------------------------------------------------------------------------------
+kernel void kernel_soft_max( // softmax of src0*scale (+ slope*mask) over one row of ne00 floats per work-group
+        global float * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        float scale,
+        float max_bias,
+        float m0,
+        float m1,
+        int n_head_log2
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0; // src1 == src0 means "no mask"; the mask row depends on i01 only
+    global float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    float slope = 1.0f;
+
+    // ALiBi: the slope multiplies the mask and is derived from i02
+    if (max_bias > 0.0f) {
+        int h = i02;
+
+        float base = h < n_head_log2 ? m0 : m1;
+        int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; // this local shadows the exp() built-in inside this block only
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float lmax = -INFINITY;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+    float max = sub_group_reduce_max(lmax); // NOTE(review): reduces within a subgroup only, while the loops stride by work-group size; appears to rely on one subgroup per work-group — confirm with the host launch
+
+    // parallel sum
+    float lsum = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+        lsum += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
+        pdst[i00] = exp_psrc0;
+    }
+
+    const float sum = sub_group_reduce_add(lsum);
+    // normalize: each work-item divides the exponentials it stored above
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        pdst[i00] /= sum;
+    }
+}
+
+#ifdef ADRENO_GPU // on Adreno this kernel requests the "half" subgroup size, exposed above as 64
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_soft_max_4( // float4 variant of kernel_soft_max: the row is processed as ne00/4 float4 values
+        global float * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        float scale,
+        float max_bias,
+        float m0,
+        float m1,
+        int n_head_log2
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0; // src1 == src0 means "no mask"; the mask row depends on i01 only
+    global float4 * pdst4 = (global float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+    float slope = 1.0f;
+
+    // ALiBi: the slope multiplies the mask and is derived from i02
+    if (max_bias > 0.0f) {
+        int h = i02;
+
+        float base = h < n_head_log2 ? m0 : m1;
+        int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; // this local shadows the exp() built-in inside this block only
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max. NOTE(review): only ne00/4 float4 values are visited; a non-zero ne00 % 4 tail is not processed here — presumably the host picks this kernel only when ne00 is a multiple of 4
+    float4 lmax4 = -INFINITY;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+    float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
+
+    const float max = sub_group_reduce_max(lmax); // NOTE(review): subgroup-wide only; appears to rely on one subgroup per work-group — confirm with the host launch
+
+    // parallel sum; the exponentials are stored in dst so they are computed once
+    float4 lsum4 = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+        lsum4 += exp_psrc4;
+        pdst4[i00] = exp_psrc4;
+    }
+    float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
+
+    const float sum = sub_group_reduce_add(lsum);
+    // normalize: each work-item divides the exponentials it stored above
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        pdst4[i00] /= sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+// kernel_rope
+//------------------------------------------------------------------------------
+float rope_yarn_ramp(float low, float high, int i0) {
+    const float t = (i0 / 2 - low) / max(0.001f, high - low); // position of pair i0/2 between low and high; the divisor is floored at 0.001
+    return 1.0f - min(1.0f, max(0.0f, t));                    // 1 at or below low, 0 at or above high
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+float2 rope_yarn(
+    float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
+) {
+    // Start from the interpolated angle; it is also the result when ext_factor is zero.
+    const float interp = freq_scale * theta_extrap;
+    float angle = interp;
+    if (ext_factor != 0.0f) {
+        // Blend the interpolated and extrapolated angles by the ramp weight of this dimension.
+        const float mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
+        angle = interp * (1 - mix) + theta_extrap * mix;
+        // Magnitude scaling corrected for interpolation.
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    return (float2)(cos(angle) * mscale, sin(angle) * mscale);
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x gives `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`.
+float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    const float ratio = n_ctx_orig / (n_rot * 2 * M_PI_F);
+    return n_dims * log(ratio) / (2 * log(base));
+}
+
+float2 rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
+) {
+    // Start and end correction dims: the start is rounded down and floored at 0,
+    // the end is rounded up and capped at n_dims - 1.
+    const float first = floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base));
+    const float last  = ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base));
+    return (float2)(max(0.0f, first), min(n_dims - 1.0f, last));
+}
+
+kernel void kernel_rope_norm_f32( // rotates adjacent float pairs (i0, i0+1) for i0 < n_dims and copies the remaining pairs
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    // one work-group per (i1, i2, i3); its work-items stride over the element pairs of that row
+    const int i1 = get_group_id(0);
+    const int i2 = get_group_id(1);
+    const int i3 = get_group_id(2);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    // src1 holds the positions, indexed by i2
+    global int * pos = src1;
+    const float pos_theta = (float) pos[i2];
+    const float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        global float * in_pair  = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global float * out_pair = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+        if (i0 >= n_dims) {
+            // past the rotated dims: plain copy of the pair
+            out_pair[0] = in_pair[0];
+            out_pair[1] = in_pair[1];
+            continue;
+        }
+
+        const int ic = i0/2;
+        const float theta = pos_theta * pow(freq_base, inv_ndims*i0);
+
+        // src2 == src0 means that no frequency factors were supplied
+        const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+        const float2 cs = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+        const float x0 = in_pair[0];
+        const float x1 = in_pair[1];
+
+        // 2-D rotation by the (cos, sin) pair returned above
+        out_pair[0] = x0*cs.s0 - x1*cs.s1;
+        out_pair[1] = x0*cs.s1 + x1*cs.s0;
+    }
+}
+
+kernel void kernel_rope_norm_f16( // rotates adjacent half pairs (i0, i0+1) for i0 < n_dims and copies the remaining pairs
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    // one work-group per (i1, i2, i3); its work-items stride over the element pairs of that row
+    const int i1 = get_group_id(0);
+    const int i2 = get_group_id(1);
+    const int i3 = get_group_id(2);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    // src1 holds the positions, indexed by i2
+    global int * pos = src1;
+    const float pos_theta = (float) pos[i2];
+    const float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        global half * in_pair  = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+        global half * out_pair = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+        if (i0 >= n_dims) {
+            // past the rotated dims: plain copy of the pair
+            out_pair[0] = in_pair[0];
+            out_pair[1] = in_pair[1];
+            continue;
+        }
+
+        const int ic = i0/2;
+        const float theta = pos_theta * pow(freq_base, inv_ndims*i0);
+
+        // src2 == src0 means that no frequency factors were supplied
+        const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+        const float2 cs = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+        const float x0 = in_pair[0];
+        const float x1 = in_pair[1];
+
+        // 2-D rotation computed in float and stored back as half
+        out_pair[0] = x0*cs.s0 - x1*cs.s1;
+        out_pair[1] = x0*cs.s1 + x1*cs.s0;
+    }
+}
+
+kernel void kernel_rope_neox_f32( // RoPE on float data, rotating split pairs (x[ic], x[ic + n_dims/2]) with ic = i0/2; one work-group per (i1, i2, i3) row
+        global void * src0, // input; accessed as float
+        ulong offset0, // byte offset applied to src0
+        global int * src1, // positions; pos[i2] is read
+        ulong offset1, // byte offset applied to src1
+        global float * src2, // frequency factors indexed by i0/2; ignored when src2 == src0 after the offsets are applied
+        ulong offset2, // byte offset applied to src2
+        global float * dst, // output; written as float
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // ne00..ne03: not read in this kernel
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00, // nb00..nb03: src0 byte strides for dims 0..3
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // upper bound of the i0 loop
+        int ne1, // ne1..ne3: not read in this kernel
+        int ne2,
+        int ne3,
+        ulong nb0, // nb0..nb3: dst byte strides for dims 0..3
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past, // not read in this kernel
+        int n_dims, // iterations with i0 < n_dims rotate a pair, the rest copy two elements
+        int n_ctx_orig, // forwarded to rope_yarn_corr_dims
+        float freq_base, // base of the per-pair frequency; also forwarded to rope_yarn_corr_dims
+        float freq_scale, // forwarded to rope_yarn
+        float ext_factor, // forwarded to rope_yarn
+        float attn_factor, // forwarded to rope_yarn
+        float beta_fast, // forwarded to rope_yarn_corr_dims
+        float beta_slow // forwarded to rope_yarn_corr_dims
+) {
+    src0 = (global void*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2); // the work-group id selects the row (i1, i2, i3)
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2]; // position associated with index i2
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) { // i0 advances in steps of two, striding by the work-group size
+        if (i0 < n_dims) {
+            int ic = i0/2; // index of the first element of the pair, and of its frequency factor
+
+            const float theta = theta_base * pow(freq_base, inv_ndims*i0); // pos * freq_base^(-i0/n_dims)
+
+            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f; // NOTE(review): src2 == src0 looks like the host's "no frequency factors" marker - confirm
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor); // NOTE(review): used below as (.s0 = cos, .s1 = sin) - confirm against rope_yarn
+
+            global float * src      = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+            global float * dst_data = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[n_dims/2]; // partner element sits n_dims/2 elements further along the row
+
+            dst_data[0]        = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1; // 2-D rotation of (x0, x1)
+            dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else { // at or beyond n_dims: copy elements i0 and i0+1 through unchanged
+            global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global float * dst_data  = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+kernel void kernel_rope_neox_f16( // RoPE on half data, rotating split pairs (x[ic], x[ic + n_dims/2]) with ic = i0/2; one work-group per (i1, i2, i3) row
+        global void * src0, // input; accessed as half
+        ulong offset0, // byte offset applied to src0
+        global int * src1, // positions; pos[i2] is read
+        ulong offset1, // byte offset applied to src1
+        global float * src2, // frequency factors indexed by i0/2; ignored when src2 == src0 after the offsets are applied
+        ulong offset2, // byte offset applied to src2
+        global float * dst, // output; written as half despite the float pointer type
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // ne00..ne03: not read in this kernel
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00, // nb00..nb03: src0 byte strides for dims 0..3
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // upper bound of the i0 loop
+        int ne1, // ne1..ne3: not read in this kernel
+        int ne2,
+        int ne3,
+        ulong nb0, // nb0..nb3: dst byte strides for dims 0..3
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past, // not read in this kernel
+        int n_dims, // iterations with i0 < n_dims rotate a pair, the rest copy two elements
+        int n_ctx_orig, // forwarded to rope_yarn_corr_dims
+        float freq_base, // base of the per-pair frequency; also forwarded to rope_yarn_corr_dims
+        float freq_scale, // forwarded to rope_yarn
+        float ext_factor, // forwarded to rope_yarn
+        float attn_factor, // forwarded to rope_yarn
+        float beta_fast, // forwarded to rope_yarn_corr_dims
+        float beta_slow // forwarded to rope_yarn_corr_dims
+) {
+    src0 = (global void*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2); // the work-group id selects the row (i1, i2, i3)
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2]; // position associated with index i2
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) { // i0 advances in steps of two, striding by the work-group size
+        if (i0 < n_dims) {
+            int ic = i0/2; // index of the first element of the pair, and of its frequency factor
+
+            const float theta = theta_base * pow(freq_base, inv_ndims*i0); // pos * freq_base^(-i0/n_dims)
+
+            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f; // NOTE(review): src2 == src0 looks like the host's "no frequency factors" marker - confirm
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor); // NOTE(review): used below as (.s0 = cos, .s1 = sin) - confirm against rope_yarn
+
+            global half * src       = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+            global half * dst_data  = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+            const float x0 = src[0]; // half values are widened to float for the rotation
+            const float x1 = src[n_dims/2]; // partner element sits n_dims/2 elements further along the row
+
+            dst_data[0]        = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1; // 2-D rotation of (x0, x1), stored back as half
+            dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else { // at or beyond n_dims: copy elements i0 and i0+1 through unchanged
+            global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global half * dst_data  = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// cpy
+//------------------------------------------------------------------------------
+
+kernel void kernel_cpy_f16_f16( // copy half -> half; one work-group per source row (i01, i02, i03)
+        global half * src0, // source tensor
+        ulong offset0, // byte offset applied to src0
+        global half * dst, // destination tensor
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // source extents; ne00 is the row length
+        int ne01,
+        int ne02,
+        int ne03, // not read in this kernel
+        ulong nb00, // source byte strides, innermost first
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // destination extents
+        int ne1,
+        int ne2,
+        int ne3, // not read in this kernel
+        ulong nb0, // destination byte strides, innermost first
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    global char * src_bytes = (global char *) src0 + offset0; // source base after the byte offset
+    global char * dst_bytes = (global char *) dst  + offsetd; // destination base after the byte offset
+
+    const int row3 = get_group_id(2);
+    const int row2 = get_group_id(1);
+    const int row1 = get_group_id(0);
+
+    const int flat = row3*ne02*ne01*ne00 + row2*ne01*ne00 + row1*ne00; // linear index of the row's first element
+
+    const int d3 = flat / (ne2*ne1*ne0); // re-express that index in destination coordinates
+    const int d2 = (flat - d3*ne2*ne1*ne0) / (ne1*ne0);
+    const int d1 = (flat - d3*ne2*ne1*ne0 - d2*ne1*ne0) / ne0;
+    const int d0 = flat - d3*ne2*ne1*ne0 - d2*ne1*ne0 - d1*ne0;
+
+    global half * out_row = (global half *) (dst_bytes + d3*nb3 + d2*nb2 + d1*nb1 + d0*nb0);
+    global char * in_row  = src_bytes + row3*nb03 + row2*nb02 + row1*nb01;
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        out_row[col] = *(global const half *) (in_row + col*nb00); // source uses nb00; destination is indexed by element
+    }
+}
+
+kernel void kernel_cpy_f16_f32( // copy half -> float; one work-group per source row (i01, i02, i03)
+        global half * src0, // source tensor
+        ulong offset0, // byte offset applied to src0
+        global float * dst, // destination tensor
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // source extents; ne00 is the row length
+        int ne01,
+        int ne02,
+        int ne03, // not read in this kernel
+        ulong nb00, // source byte strides, innermost first
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // destination extents
+        int ne1,
+        int ne2,
+        int ne3, // not read in this kernel
+        ulong nb0, // destination byte strides, innermost first
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+
+    global char * src_bytes = (global char *) src0 + offset0; // source base after the byte offset
+    global char * dst_bytes = (global char *) dst  + offsetd; // destination base after the byte offset
+
+    const int row3 = get_group_id(2);
+    const int row2 = get_group_id(1);
+    const int row1 = get_group_id(0);
+
+    const int flat = row3*ne02*ne01*ne00 + row2*ne01*ne00 + row1*ne00; // linear index of the row's first element
+
+    const int d3 = flat / (ne2*ne1*ne0); // re-express that index in destination coordinates
+    const int d2 = (flat - d3*ne2*ne1*ne0) / (ne1*ne0);
+    const int d1 = (flat - d3*ne2*ne1*ne0 - d2*ne1*ne0) / ne0;
+    const int d0 = flat - d3*ne2*ne1*ne0 - d2*ne1*ne0 - d1*ne0;
+
+    global float * out_row = (global float *) (dst_bytes + d3*nb3 + d2*nb2 + d1*nb1 + d0*nb0);
+    global char  * in_row  = src_bytes + row3*nb03 + row2*nb02 + row1*nb01;
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        out_row[col] = *(global half *) (in_row + col*nb00); // half is widened to float by the assignment
+    }
+}
+
+kernel void kernel_cpy_f32_f16( // copy float -> half; one work-group per source row (i01, i02, i03)
+        global float * src0, // source tensor
+        ulong offset0, // byte offset applied to src0
+        global half * dst, // destination tensor
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // source extents; ne00 is the row length
+        int ne01,
+        int ne02,
+        int ne03, // not read in this kernel
+        ulong nb00, // source byte strides, innermost first
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // destination extents
+        int ne1,
+        int ne2,
+        int ne3, // not read in this kernel
+        ulong nb0, // destination byte strides, innermost first
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    global char * src_bytes = (global char *) src0 + offset0; // source base after the byte offset
+    global char * dst_bytes = (global char *) dst  + offsetd; // destination base after the byte offset
+
+    const int row3 = get_group_id(2);
+    const int row2 = get_group_id(1);
+    const int row1 = get_group_id(0);
+
+    const int flat = row3*ne02*ne01*ne00 + row2*ne01*ne00 + row1*ne00; // linear index of the row's first element
+
+    const int d3 = flat / (ne2*ne1*ne0); // re-express that index in destination coordinates
+    const int d2 = (flat - d3*ne2*ne1*ne0) / (ne1*ne0);
+    const int d1 = (flat - d3*ne2*ne1*ne0 - d2*ne1*ne0) / ne0;
+    const int d0 = flat - d3*ne2*ne1*ne0 - d2*ne1*ne0 - d1*ne0;
+
+    global half * out_row = (global half *) (dst_bytes + d3*nb3 + d2*nb2 + d1*nb1 + d0*nb0);
+    global char * in_row  = src_bytes + row3*nb03 + row2*nb02 + row1*nb01;
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        global const float * in = (global const float *) (in_row + col*nb00); // source uses nb00; destination is indexed by element
+        out_row[col] = *in; // float is narrowed to half by the assignment
+    }
+}
+
+kernel void kernel_cpy_f32_f32( // copy float -> float; one work-group per source row (i01, i02, i03)
+        global float * src0, // source tensor
+        ulong offset0, // byte offset applied to src0
+        global float * dst, // destination tensor
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // source extents; ne00 is the row length
+        int ne01,
+        int ne02,
+        int ne03, // not read in this kernel
+        ulong nb00, // source byte strides, innermost first
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0, // destination extents
+        int ne1,
+        int ne2,
+        int ne3, // not read in this kernel
+        ulong nb0, // destination byte strides, innermost first
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    global char * src_bytes = (global char *) src0 + offset0; // source base after the byte offset
+    global char * dst_bytes = (global char *) dst  + offsetd; // destination base after the byte offset
+
+    const int row3 = get_group_id(2);
+    const int row2 = get_group_id(1);
+    const int row1 = get_group_id(0);
+
+    const int flat = row3*ne02*ne01*ne00 + row2*ne01*ne00 + row1*ne00; // linear index of the row's first element
+
+    const int d3 = flat / (ne2*ne1*ne0); // re-express that index in destination coordinates
+    const int d2 = (flat - d3*ne2*ne1*ne0) / (ne1*ne0);
+    const int d1 = (flat - d3*ne2*ne1*ne0 - d2*ne1*ne0) / ne0;
+    const int d0 = flat - d3*ne2*ne1*ne0 - d2*ne1*ne0 - d1*ne0;
+
+    global float * out_row = (global float *) (dst_bytes + d3*nb3 + d2*nb2 + d1*nb1 + d0*nb0);
+    global char  * in_row  = src_bytes + row3*nb03 + row2*nb02 + row1*nb01;
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        global const float * in = (global const float *) (in_row + col*nb00); // source uses nb00; destination is indexed by element
+        out_row[col] = *in;
+    }
+}
+
+//------------------------------------------------------------------------------
+// get_rows
+//------------------------------------------------------------------------------
+kernel void kernel_get_rows_f32( // gather float rows of src0 by index; one work-group per index (i10, i11)
+        global void * src0, // source rows; accessed as float
+        ulong offset0, // byte offset applied to src0
+        global int * src1, // row indices
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // gathered rows
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // elements copied per row
+        ulong nb01, // src0 byte stride per row
+        ulong nb02, // src0 byte stride per i11
+        int ne10, // not read in this kernel
+        ulong nb10, // src1 byte stride per i10
+        ulong nb11, // src1 byte stride per i11
+        ulong nb1, // dst byte stride per i10
+        ulong nb2 // dst byte stride per i11
+) {
+    global char * src0_bytes = (global char *) src0 + offset0;
+    global char * src1_bytes = (global char *) src1 + offset1;
+    global char * dst_bytes  = (global char *) dst  + offsetd;
+
+    const int idx0 = get_group_id(0);
+    const int idx1 = get_group_id(1);
+
+    const int row = *(global int *) (src1_bytes + idx1*nb11 + idx0*nb10); // which src0 row to fetch
+
+    global float * in  = (global float *) (src0_bytes + row*nb01 + idx1*nb02);
+    global float * out = (global float *) (dst_bytes + idx1*nb2 + idx0*nb1);
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        out[col] = in[col];
+    }
+}
+
+kernel void kernel_get_rows_f16( // gather half rows of src0 by index into float rows; one work-group per index (i10, i11)
+        global void * src0, // source rows; accessed as half
+        ulong offset0, // byte offset applied to src0
+        global int * src1, // row indices
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // gathered rows, stored as float
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // elements copied per row
+        ulong nb01, // src0 byte stride per row
+        ulong nb02, // src0 byte stride per i11
+        int ne10, // not read in this kernel
+        ulong nb10, // src1 byte stride per i10
+        ulong nb11, // src1 byte stride per i11
+        ulong nb1, // dst byte stride per i10
+        ulong nb2 // dst byte stride per i11
+) {
+    global char * src0_bytes = (global char *) src0 + offset0;
+    global char * src1_bytes = (global char *) src1 + offset1;
+    global char * dst_bytes  = (global char *) dst  + offsetd;
+
+    const int idx0 = get_group_id(0);
+    const int idx1 = get_group_id(1);
+
+    const int row = *(global int32_t *) (src1_bytes + idx1*nb11 + idx0*nb10); // which src0 row to fetch
+
+    global half  * in  = (global half  *) (src0_bytes + row*nb01 + idx1*nb02);
+    global float * out = (global float *) (dst_bytes + idx1*nb2 + idx0*nb1);
+
+    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) { // work-items stride across the row
+        out[col] = in[col]; // half is widened to float by the assignment
+    }
+}
+
+kernel void kernel_get_rows_q4_0( // gather q4_0 rows of src0 by index and dequantize them to float; one work-group per index (i10, i11)
+        global void * src0, // source rows; accessed as struct block_q4_0
+        ulong offset0, // byte offset applied to src0
+        global int * src1, // row indices
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // dequantized rows, written 16 floats at a time
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // elements per row; ne00/16 chunks are produced
+        ulong nb01, // src0 byte stride per row
+        ulong nb02, // src0 byte stride per i11
+        int ne10, // not read in this kernel
+        ulong nb10, // src1 byte stride per i10
+        ulong nb11, // src1 byte stride per i11
+        ulong nb1, // dst byte stride per i10
+        ulong nb2 // dst byte stride per i11
+) {
+    global char * src0_bytes = (global char *) src0 + offset0;
+    global char * src1_bytes = (global char *) src1 + offset1;
+    global char * dst_bytes  = (global char *) dst  + offsetd;
+
+    const int NL = 2; // 16-value chunks taken from each q4_0 block
+
+    const int idx0 = get_group_id(0);
+    const int idx1 = get_group_id(1);
+
+    const int row = *(global int32_t *) (src1_bytes + idx1*nb11 + idx0*nb10); // which src0 row to fetch
+
+    global struct block_q4_0 * blocks = (global struct block_q4_0 *) (src0_bytes + row*nb01 + idx1*nb02);
+    global float16           * out    = (global float16 *) (dst_bytes + idx1*nb2 + idx0*nb1);
+
+    for (int chunk = get_local_id(0); chunk < ne00/16; chunk += get_local_size(0)) { // work-items stride across the chunks
+        float16 vals;
+        dequantize_q4_0_f32(blocks + chunk/NL, chunk%NL, &vals); // chunk/NL picks the block, chunk%NL the part of it
+        out[chunk] = vals;
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f32_f32
+//------------------------------------------------------------------------------
+#define N_F32_F32 4 // src1 rows handled by one work-group
+
+kernel void kernel_mul_mat_f32_f32( // dst[im][r1][r0] = dot(src0 row r0, src1 row r1), both operands float
+        global char * src0, // float matrix, addressed through byte strides
+        ulong offset0, // byte offset applied to src0
+        global char * src1, // float matrix, addressed through byte strides
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // output, indexed as im*ne1*ne0 + r1*ne0 + r0
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // length of each dot product
+        int ne01, // not read in this kernel
+        int ne02, // not read in this kernel
+        ulong nb00, // not read in this kernel
+        ulong nb01, // src0 byte stride per r0
+        ulong nb02, // src0 byte stride per i12/r2
+        ulong nb03, // src0 byte stride per i13/r3
+        int ne10, // not read in this kernel
+        int ne11, // number of src1 rows; bounds r1
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        ulong nb10, // not read in this kernel
+        ulong nb11, // src1 byte stride per r1
+        ulong nb12, // src1 byte stride per i12
+        ulong nb13, // src1 byte stride per i13
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3 // divisor mapping i13 to the src0 index along dim 3
+) {
+    src0 = (global char*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0); // src0 row
+    int rb = get_group_id(1)*N_F32_F32; // first src1 row of this work-group
+    int im = get_group_id(2); // combined index over dims 2 and 3
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global float * x = (global float *) (src0 + offset_src0);
+
+    if (ne00 < 128) { // short rows: element-wise loop
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float * y = (global float *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) { // lanes stride by the maximum subgroup size; NOTE(review): presumably one subgroup per work-group - confirm against the host launch
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) { // lane 0 stores the result
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else { // long rows: four elements per step through float4 views
+        global float4 * x4 = (global float4 *)x;
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float  * y  = (global float  *) (src1 + offset_src1);
+            global float4 * y4 = (global float4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += (float) x4[i].s0 * y4[i].s0;
+                sumf += (float) x4[i].s1 * y4[i].s1;
+                sumf += (float) x4[i].s2 * y4[i].s2;
+                sumf += (float) x4[i].s3 * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) { // lane 0 adds the ne00 % 4 leftover elements
+                    all_sum += (float) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f16
+//------------------------------------------------------------------------------
+#define N_F16_F16 4 // src1 rows handled by one work-group
+
+kernel void kernel_mul_mat_f16_f16( // dst[im][r1][r0] = dot(src0 row r0, src1 row r1), both operands half, float output
+        global char * src0, // half matrix, addressed through byte strides
+        ulong offset0, // byte offset applied to src0
+        global char * src1, // half matrix, addressed through byte strides
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // output, indexed as im*ne1*ne0 + r1*ne0 + r0
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // length of each dot product
+        int ne01, // not read in this kernel
+        int ne02, // not read in this kernel
+        ulong nb00, // not read in this kernel
+        ulong nb01, // src0 byte stride per r0
+        ulong nb02, // src0 byte stride per i12/r2
+        ulong nb03, // src0 byte stride per i13/r3
+        int ne10, // not read in this kernel
+        int ne11, // number of src1 rows; bounds r1
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        ulong nb10, // not read in this kernel
+        ulong nb11, // src1 byte stride per r1
+        ulong nb12, // src1 byte stride per i12
+        ulong nb13, // src1 byte stride per i13
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3) // divisor mapping i13 to the src0 index along dim 3
+{
+    src0 = (global char*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0); // src0 row
+    int rb = get_group_id(1)*N_F16_F16; // first src1 row of this work-group
+    int im = get_group_id(2); // combined index over dims 2 and 3
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half * x = (global half *) (src0 + offset_src0);
+
+    if (ne00 < 128) { // short rows: element-wise loop
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global half * y = (global half *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) { // lanes stride by the maximum subgroup size; NOTE(review): presumably one subgroup per work-group - confirm against the host launch
+                sumf += (half) x[i] * (half) y[i]; // the product is formed in half, then accumulated in float
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) { // lane 0 stores the result
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else { // long rows: four elements per step through half4 views
+        global half4 * x4 = (global half4 *)x;
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global half  * y  = (global half  *) (src1 + offset_src1);
+            global half4 * y4 = (global half4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += (half) x4[i].s0 * y4[i].s0; // each product is formed in half
+                sumf += (half) x4[i].s1 * y4[i].s1;
+                sumf += (half) x4[i].s2 * y4[i].s2;
+                sumf += (half) x4[i].s3 * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) { // lane 0 adds the ne00 % 4 leftover elements
+                    all_sum += (half) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_1row
+//------------------------------------------------------------------------------
+kernel void kernel_mul_mat_f16_f32_1row( // dst[im][r1][r0] = dot(half src0 row r0, float src1 row r1); one work-group per (r0, r1, im)
+        global char * src0, // half matrix, addressed through byte strides
+        ulong offset0, // byte offset applied to src0
+        global char * src1, // float matrix, addressed through byte strides
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // output, indexed as im*ne1*ne0 + r1*ne0 + r0
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // length of each dot product
+        int ne01, // not read in this kernel
+        int ne02, // not read in this kernel
+        ulong nb00, // not read in this kernel
+        ulong nb01, // src0 byte stride per r0
+        ulong nb02, // src0 byte stride per i12/r2
+        ulong nb03, // src0 byte stride per i13/r3
+        int ne10, // not read in this kernel
+        int ne11, // not read in this kernel
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        ulong nb10, // not read in this kernel
+        ulong nb11, // src1 byte stride per r1
+        ulong nb12, // src1 byte stride per i12
+        ulong nb13, // src1 byte stride per i13
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3 // divisor mapping i13 to the src0 index along dim 3
+) {
+    src0 = (global char*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0); // src0 row
+    int r1 = get_group_id(1); // src1 row
+    int im = get_group_id(2); // combined index over dims 2 and 3
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    global half  * x = (global half  *) (src0 + offset_src0);
+    global float * y = (global float *) (src1 + offset_src1);
+
+    float sumf = 0;
+    if (ne00 < 128) { // short rows: element-wise loop
+        for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) { // lanes stride by the maximum subgroup size; NOTE(review): presumably one subgroup per work-group - confirm against the host launch
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+        if (get_sub_group_local_id() == 0) { // lane 0 stores the result
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else { // long rows: four elements per step through half4/float4 views
+        global half4  * x4 = (global half4  *) x;
+        global float4 * y4 = (global float4 *) y;
+        for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+            sumf += (float) x4[i].s0 * y4[i].s0;
+            sumf += (float) x4[i].s1 * y4[i].s1;
+            sumf += (float) x4[i].s2 * y4[i].s2;
+            sumf += (float) x4[i].s3 * y4[i].s3;
+        }
+        float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+        if (get_sub_group_local_id() == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) { // lane 0 adds the ne00 % 4 leftover elements
+                all_sum += (float) x[i] * y[i];
+            }
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32
+//------------------------------------------------------------------------------
+#define N_F16_F32 4 // src1 rows handled by one work-group
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64 // defined outside this section; NOTE(review): presumably requests a subgroup size of 64 - confirm
+#endif
+kernel void kernel_mul_mat_f16_f32( // dst[im][r1][r0] = dot(half src0 row r0, float src1 row r1)
+        global char * src0, // half matrix, addressed through byte strides
+        ulong offset0, // byte offset applied to src0
+        global char * src1, // float matrix, addressed through byte strides
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // output, indexed as im*ne1*ne0 + r1*ne0 + r0
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // length of each dot product
+        int ne01, // not read in this kernel
+        int ne02, // not read in this kernel
+        ulong nb00, // not read in this kernel
+        ulong nb01, // src0 byte stride per r0
+        ulong nb02, // src0 byte stride per i12/r2
+        ulong nb03, // src0 byte stride per i13/r3
+        int ne10, // not read in this kernel
+        int ne11, // number of src1 rows; bounds r1
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        ulong nb10, // not read in this kernel
+        ulong nb11, // src1 byte stride per r1
+        ulong nb12, // src1 byte stride per i12
+        ulong nb13, // src1 byte stride per i13
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3 // divisor mapping i13 to the src0 index along dim 3
+) {
+    src0 = (global char*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0); // src0 row
+    int rb = get_group_id(1)*N_F16_F32; // first src1 row of this work-group
+    int im = get_group_id(2); // combined index over dims 2 and 3
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half * x = (global half *) (src0 + offset_src0);
+
+    if (ne00 < 128) { // short rows: element-wise loop
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float * y = (global float *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) { // lanes stride by the maximum subgroup size
+                sumf += convert_float(x[i]) * y[i]; // half is widened before the multiply
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) { // lane 0 stores the result
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else { // long rows: four elements per step through half4/float4 views
+        global half4 * x4 = (global half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) { // no more src1 rows
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float  * y  = (global float  *) (src1 + offset_src1);
+            global float4 * y4 = (global float4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += convert_float(x4[i].s0) * y4[i].s0;
+                sumf += convert_float(x4[i].s1) * y4[i].s1;
+                sumf += convert_float(x4[i].s2) * y4[i].s2;
+                sumf += convert_float(x4[i].s3) * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) { // lane 0 adds the ne00 % 4 leftover elements
+                    all_sum += (float) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_l4
+//------------------------------------------------------------------------------
+// Assumes row size (ne00) is a multiple of 4
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64 // defined outside this section; NOTE(review): presumably requests a subgroup size of 64 - confirm
+#endif
+kernel void kernel_mul_mat_f16_f32_l4( // half src0 times float src1, four elements per step with no tail loop; each work-group (r0, im) walks all ne11 src1 rows
+        global char * src0, // half matrix, addressed through byte strides
+        ulong offset0, // byte offset applied to src0
+        global char * src1, // float matrix, addressed through byte strides
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // output, indexed as im*ne1*ne0 + r1*ne0 + r0
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // length of each dot product; only ne00/4 vector steps are taken
+        int ne01, // not read in this kernel
+        int ne02, // not read in this kernel
+        ulong nb00, // not read in this kernel
+        ulong nb01, // src0 byte stride per r0
+        ulong nb02, // src0 byte stride per i12/r2
+        ulong nb03, // src0 byte stride per i13/r3
+        int ne10, // not read in this kernel
+        int ne11, // number of src1 rows looped over
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        ulong nb10, // not read in this kernel
+        ulong nb11, // src1 byte stride per r1
+        ulong nb12, // src1 byte stride per i12
+        ulong nb13, // src1 byte stride per i13
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3 // divisor mapping i13 to the src0 index along dim 3
+) {
+    src0 = (global char*)((global char*)src0 + offset0); // apply the byte offsets to the base pointers
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int nrows = ne11;
+    int r0 = get_group_id(0); // src0 row
+    int im = get_group_id(2); // combined index over dims 2 and 3; group id 1 is not used
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half4 * x4 = (global half4 *) (src0 + offset_src0);
+
+    for (int r1 = 0; r1 < nrows; ++r1) { // the same src0 row is reused against every src1 row
+        ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+        global float4 * y4 = (global float4 *) (src1 + offset_src1);
+
+        float sumf = 0;
+        for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) { // lanes stride by the maximum subgroup size
+            sumf += convert_float(x4[i].s0) * y4[i].s0;
+            sumf += convert_float(x4[i].s1) * y4[i].s1;
+            sumf += convert_float(x4[i].s2) * y4[i].s2;
+            sumf += convert_float(x4[i].s3) * y4[i].s3;
+        }
+
+        float all_sum = sub_group_reduce_add(sumf); // combine the per-lane partial sums
+        if (get_sub_group_local_id() == 0) { // lane 0 stores the result
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32
+//------------------------------------------------------------------------------
+// computes the inner product between half a q4_0 block and 16 floats (yl); sumy is the sum of the 16 y values before scaling
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_4_0_dot_y(
+        global struct block_q4_0 * qb_curr, // block whose quants are read
+        float sumy, // see above
+        private float * yl, // 16 pre-scaled y values, see above
+        int il // see above
+) {
+    float d = qb_curr->d; // block scale factor
+    float2 acc = 0.f;
+    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2); // view the block as ushorts: skip the first one, then il/2 more
+    for (int i = 0; i < 8; i+=2) {
+        acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F) // nibbles are masked in place, not shifted down; yl carries the matching scale
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc.s0 + acc.s1); // equals d * sum(y * (q - 8)): the -8*sumy term subtracts 8 from every quant
+}
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 64 // assuming SIMD group size is 64
+#endif // NOTE(review): none of the three macros is defined here when neither GPU macro is set
+
+inline void mul_vec_q_n_f32( // q4_0 src0 times float src1; each subgroup accumulates N_DST consecutive dst rows
+        global void * src0, // q4_0 blocks, indexed as ne00/QK4_0 consecutive blocks per row
+        global float * src1, // float input, indexed as r1*ne10 + im*ne00*ne1
+        global float * dst, // float output, indexed as r1*ne0 + im*ne0*ne1 + row
+        int ne00, // elements per src0 row
+        int ne01, // number of src0 rows; also guards the final stores
+        int ne02, // src0 extent along dim 2
+        int ne10, // float stride between src1 rows
+        int ne12, // splits group id 2 into i12 = im % ne12 and i13 = im / ne12
+        int ne0, // dst extent used in the output index
+        int ne1, // dst extent used in the output index
+        int r2, // divisor mapping i12 to the src0 index along dim 2
+        int r3 // divisor mapping i13 to the src0 index along dim 3
+) {
+
+    const ulong nb = ne00/QK4_0; // blocks per src0 row
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
+    // id of a SIMD group in the grid.
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // offset in blocks, not bytes
+
+    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16];       // src1 vector cache
+    float sumf[N_DST]={0.f};
+
+    int ix = get_sub_group_local_id()/2; // two lanes share one block
+    int il = 8*(get_sub_group_local_id()%2); // even lanes start at offset 0, odd lanes at offset 8
+
+    global float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f; // pre-scaled for the unshifted 0x0F00 mask in block_q_4_0_dot_y
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f; // pre-scaled for the unshifted 0x00F0 mask
+            yl[i+9] = yb[i+17]/4096.f; // pre-scaled for the unshifted 0xF000 mask
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il); // same block position, `row` rows further on
+        }
+
+        // One thread in a SIMD group (i.e., subgroup) handles a half block,
+        // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
+        // y points to the activation matrix (of type float). Therefore for
+        // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+        // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+        // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    // The above does not work for Adreno - it produces incorrect results for
+    // row = 1, 2, 3 and only row = 0 gives the correct result.
+    // If N_DST is changed, the below array must be initialized accordingly.
+    // This also seems to perform better on Intel.
+    float tot[N_DST] = {
+        sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
+        sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
+    for (int row = 0; row < N_DST; ++row) {
+        if (get_sub_group_local_id() == 0 && first_row + row < ne01) { // lane 0 stores; rows past ne01 are skipped
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32( // kernel entry point: applies the byte offsets, then delegates to mul_vec_q_n_f32
+        global void * src0, // q4_0 input
+        ulong offset0, // byte offset applied to src0
+        global float * src1, // float input
+        ulong offset1, // byte offset applied to src1
+        global float * dst, // float output
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // ne00..r3 are forwarded unchanged
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    global void  * x_base = (global char *) src0 + offset0;
+    global float * y_base = (global float *) ((global char *) src1 + offset1);
+    global float * out    = (global float *) ((global char *) dst  + offsetd);
+
+    mul_vec_q_n_f32(x_base, y_base, out, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+// block_q_4_0_dot_y_v: partial dot product of one half of a q4_0 block with 16 cached src1 values.
+// This variant unrolls the loops and uses vector types instead of pointers.
+// It improves performance on Adreno but not so much on Intel.
+// Nibbles are masked in place (never shifted), so yl must arrive pre-divided by 1, 256, 16 or 4096 per position.
+inline float block_q_4_0_dot_y_v(
+        global struct block_q4_0 * qb_curr, // block to read: scale d followed by the packed quants
+        float sumy, // sum of the unscaled src1 values that yl was built from
+        float16 yl, // src1 values, pre-scaled to cancel the bit position of the nibble they multiply
+        int il // quant offset within the block (callers in this file pass 0 or 8)
+) {
+    float d = qb_curr->d; // block scale, widened from half
+    float acc = 0.f;
+    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2); // +1 skips the 2-byte d; il/2 ushorts selects the starting byte inside qs
+
+    acc += yl.s0 * (qs[0] & 0x000F); // bits 0..3: value as is
+    acc += yl.s1 * (qs[0] & 0x0F00); // bits 8..11: value * 256
+    acc += yl.s8 * (qs[0] & 0x00F0); // bits 4..7: value * 16
+    acc += yl.s9 * (qs[0] & 0xF000); // bits 12..15: value * 4096
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc); // sum((q - 8) * y) == sum(q * y) - 8 * sum(y), so the -8 offset is applied once via sumy
+}
+
+#undef N_DST // drop the values left over from the previous kernel variant
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 64 // assuming SIMD group size is 64
+#endif
+
+inline void mul_vec_q_n_f32_v( // q4_0 (src0) times f32 (src1) mat-vec product; each SIMD group produces 4 consecutive dst values.
+        global void * src0, // reinterpreted below as an array of struct block_q4_0
+        global float * src1,
+        global float * dst,
+        int ne00, // src0 row length in elements; ne00/QK4_0 blocks per row
+        int ne01, // number of src0 rows; dst writes are bounds-checked against it
+        int ne02,
+        int ne10, // multiplied by the group id in dimension 1 to locate the src1 row
+        int ne12,
+        int ne0, // multiplied by the group id in dimension 1 to locate the dst row
+        int ne1,
+        int r2, // divides i12 when locating the src0 batch (presumably a broadcast ratio - confirm with the host code)
+        int r3 // divides i13 in the same way
+) {
+    const ulong nb = ne00/QK4_0; // q4_0 blocks per src0 row
+
+    int r0 = get_group_id(0); // selects the group of src0 rows
+    int r1 = get_group_id(1); // selects the src1 row and the dst row
+    int im = get_group_id(2); // flattened batch index, split into i12/i13 below
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
+    // id of a SIMD group in the grid.
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // counted in blocks
+
+    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;       // src1 vector cache
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); // per-lane partial sums, one component per output row
+
+    int ix = get_sub_group_local_id()/2; // first block handled by this lane (two lanes share one block)
+    int il = 8*(get_sub_group_local_id()%2); // 0 or 8: which half of the block this lane takes
+
+    global float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+
+        sumy += yb[0]; // block elements il+0 .. il+7
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16]; // block elements il+16 .. il+23
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+
+        yl.s0 = yb[0]; // multiplied by a nibble masked with 0x000F: no scaling needed
+        yl.s1 = yb[1]/256.f; // multiplied by a nibble masked with 0x0F00 (unshifted), hence /256
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f; // multiplied by a nibble masked with 0x00F0 (unshifted), hence /16
+        yl.s9 = yb[17]/4096.f; // multiplied by a nibble masked with 0xF000 (unshifted), hence /4096
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il); // rows are nb blocks apart
+        sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
+
+        // One thread in a SIMD group (i.e., subgroup) handles a half block,
+        // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
+        // y points to the activation matrix (of type float). Therefore for
+        // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+        // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+        // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    // Reduce each row's partial sums across the SIMD group with one explicit
+    // call per component. NOTE(review): the note inherited here says an earlier
+    // form gave wrong results on Adreno for rows 1, 2, 3 and that this form
+    // also performs better on Intel. Extend the list if N_DST changes from 4.
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) { // only the first lane of the SIMD group writes the results
+        if (first_row + 0 < ne01) { // skip rows past the end of src0
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU // requested subgroup size mirrors N_SIMDWIDTH used by mul_vec_q_n_f32_v (16 on Intel, 64 on Adreno)
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_v( // Entry point: applies the byte offsets to the buffers, then delegates to mul_vec_q_n_f32_v().
+        global void * src0,
+        ulong offset0, // byte offset added to src0
+        global float * src1,
+        ulong offset1, // byte offset added to src1
+        global float * dst,
+        ulong offsetd, // byte offset added to dst
+        int ne00, // ne00..r3 are forwarded unchanged to mul_vec_q_n_f32_v()
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0); // cast to char* so the offset is counted in bytes
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_0
+// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits. Work-item i converts block i.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q4_0(
+    global struct block_q4_0 * src0, // input blocks (array of structs)
+    global uchar * dst_q, // output: QK4_0/2 quant bytes per block, stored back to back
+    global half  * dst_d // output: one scale per block
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0); // this work-item's block
+    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0); // its slot in the quant array
+    global half  * d = (global half *) dst_d + get_global_id(0); // its slot in the scale array
+
+    *d = b->d;
+
+    for (int i = 0; i < QK4_0/2; ++i) { // plain byte copy; nibble order is left untouched
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q4_0( // Rebuilds block_q4_0 structs from the two flat arrays (SOA -> AOS); work-item i restores block i.
+    global uchar * src_q, // input: QK4_0/2 quant bytes per block, stored back to back
+    global half  * src_d, // input: one scale per block
+    global struct block_q4_0 * dst // output blocks (array of structs)
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0); // this work-item's output block
+    global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0); // its quant bytes
+    global half  * d = (global half *) src_d + get_global_id(0); // its scale
+
+    b->d = *d;
+    for (int i = 0; i < QK4_0/2; ++i) { // plain byte copy; nibble order is left untouched
+        b->qs[i] = q[i];
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32_flat
+//
+// This variation uses flat arrays (struct of arrays, SOA) representation for
+// quant tensors.
+//------------------------------------------------------------------------------
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat( // Partial dot product of one half of a block, reading quants and scale from separate arrays.
+        global uchar * x, // first quant byte of the block
+        global half  * dh, // scale of the block
+        float sumy, // sum of the unscaled src1 values that yl was built from
+        float16 yl, // src1 values, pre-scaled to cancel the bit position of the nibble they multiply
+        int il // quant offset within the block (callers in this file pass 0 or 8)
+) {
+    float           d   = *dh; // block scale, widened from half
+    global ushort * qs  = ((global ushort *)x + il/2); // il/2 ushorts selects the starting byte inside the block's quants
+    float           acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F); // bits 0..3: value as is
+    acc += yl.s1 * (qs[0] & 0x0F00); // bits 8..11: value * 256
+    acc += yl.s8 * (qs[0] & 0x00F0); // bits 4..7: value * 16
+    acc += yl.s9 * (qs[0] & 0xF000); // bits 12..15: value * 4096
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc); // sum((q - 8) * y) == sum(q * y) - 8 * sum(y), so the -8 offset is applied once via sumy
+}
+
+#undef N_DST // drop the values left over from the previous kernel variant
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 64 // assuming SIMD group size is 64
+#endif
+
+inline void mul_vec_q_n_f32_flat( // q4_0 x f32 mat-vec product on the flat (SOA) layout; each SIMD group produces 4 consecutive dst values.
+        global uchar * src0_q, // packed quants, QK4_0/2 bytes per block
+        global half  * src0_d, // scales, one per block
+        global float * src1,
+        global float * dst,
+        int ne00, // src0 row length in elements; ne00/QK4_0 blocks per row
+        int ne01, // number of src0 rows; dst writes are bounds-checked against it
+        int ne02,
+        int ne10, // multiplied by the group id in dimension 1 to locate the src1 row
+        int ne12,
+        int ne0, // multiplied by the group id in dimension 1 to locate the dst row
+        int ne1,
+        int r2, // divides i12 when locating the src0 batch (presumably a broadcast ratio - confirm with the host code)
+        int r3 // divides i13 in the same way
+) {
+    const ulong nb = ne00/QK4_0; // q4_0 blocks per src0 row
+
+    int r0 = get_group_id(0); // selects the group of src0 rows
+    int r1 = get_group_id(1); // selects the src1 row and the dst row
+    int im = get_group_id(2); // flattened batch index, split into i12/i13 below
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl; // src1 vector cache
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); // per-lane partial sums, one component per output row
+
+    int ix = get_sub_group_local_id()/2; // first block handled by this lane (two lanes share one block)
+    int il = 8*(get_sub_group_local_id()%2); // 0 or 8: which half of the block this lane takes
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { // the SIMD group covers N_SIMDWIDTH/2 blocks per iteration
+        float sumy = 0.f;
+
+        sumy += yb[0]; // block elements il+0 .. il+7
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16]; // block elements il+16 .. il+23
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0]; // multiplied by a nibble masked with 0x000F: no scaling needed
+        yl.s1 = yb[1]/256.f; // multiplied by a nibble masked with 0x0F00 (unshifted), hence /256
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f; // multiplied by a nibble masked with 0x00F0 (unshifted), hence /16
+        yl.s9 = yb[17]/4096.f; // multiplied by a nibble masked with 0xF000 (unshifted), hence /4096
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il); // rows are nb blocks apart in both arrays
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2); // advance src1 by the blocks the SIMD group just consumed
+    }
+
+    float4 tot = (float4)( // reduce each row's partial sums across the SIMD group
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) { // only the first lane of the SIMD group writes the results
+        if (first_row + 0 < ne01) { // skip rows past the end of src0
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU // requested subgroup size mirrors N_SIMDWIDTH used by mul_vec_q_n_f32_flat (16 on Intel, 64 on Adreno)
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat( // Entry point: offsets src1 and dst, then delegates to mul_vec_q_n_f32_flat().
+        global uchar * src0_q, // packed quants; used as passed, no offset parameter
+        global half  * src0_d, // scales; used as passed, no offset parameter
+        global float * src1,
+        ulong offset1, // byte offset added to src1
+        global float * dst,
+        ulong offsetd, // byte offset added to dst
+        int ne00, // ne00..r3 are forwarded unchanged to mul_vec_q_n_f32_flat()
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1); // cast to char* so the offset is counted in bytes
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//
+// This variant outputs 8 values (dst rows) per SIMD group.
+//
+#undef N_DST // drop the values left over from the previous kernel variant
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 64 // assuming SIMD group size is 64
+#endif
+
+inline void mul_vec_q_n_f32_8x_flat( // q4_0 x f32 mat-vec product on the flat (SOA) layout; each SIMD group produces 8 consecutive dst values.
+        global uchar * src0_q, // packed quants, QK4_0/2 bytes per block
+        global half  * src0_d, // scales, one per block
+        global float * src1,
+        global float * dst,
+        int ne00, // src0 row length in elements; ne00/QK4_0 blocks per row
+        int ne01, // number of src0 rows; dst writes are bounds-checked against it
+        int ne02,
+        int ne10, // multiplied by the group id in dimension 1 to locate the src1 row
+        int ne12,
+        int ne0, // multiplied by the group id in dimension 1 to locate the dst row
+        int ne1,
+        int r2, // divides i12 when locating the src0 batch (presumably a broadcast ratio - confirm with the host code)
+        int r3 // divides i13 in the same way
+) {
+    const ulong nb = ne00/QK4_0; // q4_0 blocks per src0 row
+
+    int r0 = get_group_id(0); // selects the group of src0 rows
+    int r1 = get_group_id(1); // selects the src1 row and the dst row
+    int im = get_group_id(2); // flattened batch index, split into i12/i13 below
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl; // src1 vector cache
+    float8 sumf = 0.f; // per-lane partial sums, one component per output row
+
+    int ix = get_sub_group_local_id()/2; // first block handled by this lane (two lanes share one block)
+    int il = 8*(get_sub_group_local_id()%2); // 0 or 8: which half of the block this lane takes
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { // the SIMD group covers N_SIMDWIDTH/2 blocks per iteration
+        float sumy = 0.f;
+
+        sumy += yb[0]; // block elements il+0 .. il+7
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16]; // block elements il+16 .. il+23
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0]; // multiplied by a nibble masked with 0x000F: no scaling needed
+        yl.s1 = yb[1]/256.f; // multiplied by a nibble masked with 0x0F00 (unshifted), hence /256
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f; // multiplied by a nibble masked with 0x00F0 (unshifted), hence /16
+        yl.s9 = yb[17]/4096.f; // multiplied by a nibble masked with 0xF000 (unshifted), hence /4096
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il); // rows are nb blocks apart in both arrays
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2); // advance src1 by the blocks the SIMD group just consumed
+    }
+
+    float8 tot = (float8)( // reduce each row's partial sums across the SIMD group
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) { // only the first lane of the SIMD group writes the results
+        if (first_row + 0 < ne01) { // skip rows past the end of src0
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU // requested subgroup size mirrors N_SIMDWIDTH used by mul_vec_q_n_f32_8x_flat (16 on Intel, 64 on Adreno)
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_8x_flat( // Entry point: offsets src1 and dst, then delegates to mul_vec_q_n_f32_8x_flat().
+        global uchar * src0_q, // packed quants; used as passed, no offset parameter
+        global half  * src0_d, // scales; used as passed, no offset parameter
+        global float * src1,
+        ulong offset1, // byte offset added to src1
+        global float * dst,
+        ulong offsetd, // byte offset added to dst
+        int ne00, // ne00..r3 are forwarded unchanged to mul_vec_q_n_f32_8x_flat()
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1); // cast to char* so the offset is counted in bytes
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
new file mode 100644
index 00000000000..e2024332f81
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
@@ -0,0 +1,106 @@
+//------------------------------------------------------------------------------
+// This file contains additional kernels for data conversion.
+// These kernels are used when loading the model, so their performance is less
+// important.
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16 // prefer the Khronos fp16 extension when the compiler advertises it
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16) // otherwise fall back to the AMD extension
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else // no half support at all: stop the build with a readable message
+#error "Half precision floating point not supported by OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Intel: define attributes requesting a subgroup size of 16 or 32.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Adreno: define attributes requesting the "half" (64) or "full" (128) subgroup size.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32 // 4-bit values per q4_0 block; block_q4_0.qs holds QK4_0/2 bytes
+#define QR4_0                   2
+#define QK4_1                   32
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256
+#define K_QUANTS_PER_ITERATION  2
+
+typedef char int8_t; // stdint-style names mapped onto the OpenCL built-in integer types
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0: one quantized block - a half-precision scale plus QK4_0 4-bit values
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d; // scale of the block
+    uint8_t qs[QK4_0 / 2]; // 4-bit values, packed two per byte
+};
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_0_noshuffle
+//
+// Converts block_q4_0 into the flat arrays (struct of arrays, SOA)
+// representation for quant tensors, using non shuffled bit order for weights.
+//
+// The shuffled version is kept in the original file because moving it here
+// seems to result in worse performance for adreno.
+//------------------------------------------------------------------------------
+
+kernel void kernel_convert_block_q4_0_noshuffle( // Splits block_q4_0 into quant and scale arrays while regrouping the nibbles; work-item i converts block i.
+    global struct block_q4_0 * src0, // input blocks (array of structs)
+    global uchar * dst_q, // output: QK4_0/2 quant bytes per block, stored back to back
+    global half  * dst_d // output: one scale per block
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0); // this work-item's block
+    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0); // its slot in the quant array
+    global half  * d = (global half *) dst_d + get_global_id(0); // its slot in the scale array
+
+    *d = b->d;
+    for (int i = 0; i < QK4_0/4; ++i) { // each iteration reads two source bytes and writes two destination bytes
+        uchar x0 = b->qs[2*i + 0];
+        uchar x1 = b->qs[2*i + 1];
+
+        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // low nibbles of x0 and x1 go to the first half of q
+        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // high nibbles of x0 and x1 go to the second half of q
+
+#ifdef ADRENO_GPU
+        // Workaround for adreno - must have the following printf statement for
+        // the kernel to work properly. Otherwise it produces incorrect result.
+        // convert_uchar above also seems necessary.
+        // Compare against a large number so that it does not print anything.
+        // get_sub_group_local_id() also works.
+        if (get_global_id(0) == 65536*4096) {
+            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
+        }
+#endif
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
new file mode 100644
index 00000000000..5e195411d69
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@@ -0,0 +1,265 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// Assumed constants: weights per Q4_0 block, and number of waves (sub-groups) that split the K loop.
+#define QK4_0 32
+#define N_SIMDGROUP 4
+// First half of a block for two rows: 16 weights per row times B values broadcast one float at a time from sub-group lanes 0 and 1. Even bits4 components and scale.s0 feed total_sums.s0; odd components and scale.s1 feed total_sums.s1.
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+    float shared_y; /* declared here and reused by the matching _lo macro, so this macro must expand first in the same scope */ \
+    shared_y = sub_group_broadcast(y.s0, 0); /* lane 0: y.s0..s3 pair with the four nibbles of bits4.s0 / bits4.s1 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; /* (4-bit value - 8) * row scale * B value */ \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); /* lane 0: y.s4..s7 pair with bits4.s2 / bits4.s3 */ \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); /* lane 1: y.s0..s3 pair with bits4.s4 / bits4.s5 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); /* lane 1: y.s4..s7 pair with bits4.s6 / bits4.s7 */ \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+// Second half of a block: same arithmetic as the _1_hi macro, but B values are broadcast from sub-group lanes 2 and 3. It does not declare shared_y, so it must follow the _1_hi macro in the same scope.
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); /* lane 2: y.s0..s3 pair with the four nibbles of bits4.s0 / bits4.s1 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); /* lane 2: y.s4..s7 pair with bits4.s2 / bits4.s3 */ \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); /* lane 3: y.s0..s3 pair with bits4.s4 / bits4.s5 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); /* lane 3: y.s4..s7 pair with bits4.s6 / bits4.s7 */ \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+// Vector form of the first-half accumulation: the whole float8 y is broadcast from lane 0, then lane 1, and 16 weights per row are accumulated. NOTE(review): a float8 sub_group_broadcast is presumably a vendor extension - confirm.
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+    float8 shared_y; /* declared here and reused by the matching _lo macro, so this macro must expand first in the same scope */ \
+    shared_y = sub_group_broadcast(y, 0); /* lane 0: row 0 uses bits4.s0 and bits4.s2, row 1 uses bits4.s1 and bits4.s3 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); /* lane 1: row 0 uses bits4.s4 and bits4.s6, row 1 uses bits4.s5 and bits4.s7 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+// Vector form of the second-half accumulation: same arithmetic as the _8_hi macro with y broadcast from lanes 2 and 3. It does not declare shared_y, so it must follow the _8_hi macro in the same scope.
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y, 2); /* lane 2: row 0 uses bits4.s0 and bits4.s2, row 1 uses bits4.s1 and bits4.s3 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); /* lane 3: row 0 uses bits4.s4 and bits4.s6, row 1 uses bits4.s5 and bits4.s7 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+// Q4_0 GEMV: each work-item accumulates two outputs over K-blocks k = groupId, groupId + N_SIMDGROUP, ...; partial sums of groupId 1-3 are added into groupId 0 via local memory before the store.
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+        __read_only  image1d_buffer_t src0_q,  // quantized A
+        global half2  * src0_d,  // A scales
+        __read_only  image1d_buffer_t src1,    // B
+        ulong offset1,            // offset to B (0); not referenced in the kernel body
+        global float * dst,     // C
+        ulong offsetd,            // offset to C (0); applied to dst as a byte offset
+        uint K,               // K
+        int ne01,               // M (ne01 through r3 are not referenced in the kernel body)
+        int ne02,               // 1
+        int ne10,               // K
+        int ne12,               // 1
+        int ne0,                // M
+        int ne1,                // N
+        int r2,                 // 1
+        int r3)
+{
+    uint groupId = get_local_id(1); // NOTE(review): treated as the wave index 0..3 below; assumes each local-y slice is one sub-group - confirm launch config
+    uint gid     = get_global_id(0); // selects the output pair dst[2*gid], dst[2*gid + 1] written at the end
+    ushort slid    = get_sub_group_local_id(); // lane index within the sub-group
+
+    __private uint4     regA; // packed 4-bit weights; reinterpreted as ushort8 for the accumulate macros
+    __private half2     regS; // scales of the two rows for the current block
+    __private float8    regB; // 8 B values; only assigned in lanes 0-3
+
+    __private float2 totalSum = (float2)(0.0f);
+
+    // loop along K in block granularity, skip 4 blocks every iter
+    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows; LINE_STRIDE_A is not defined in this file (presumably a build option - confirm)
+        // first 4 fibers in each wave load 8 B values to its private scope
+        if (slid < 4) { // other lanes leave regB unset; the macros only broadcast from lanes 0-3
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        // load half weights for two blocks in consecutive rows
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; // BLOCK_STRIDE_A is not defined in this file either
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT // NOTE: spelled "BROADCAT"; a define must match this spelling to select the float8 path
+        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; // lines 4-7 of the block feed the _lo macro
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+    }
+
+    // reduction in local memory, assumes #wave=4
+    __local float2 reduceLM[SIMDGROUP_WIDTH * 3]; // one slot per lane for groupId 1, 2 and 3; SIMDGROUP_WIDTH is not defined in this file
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE); // partial sums must be visible before groupId 0 reads them
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd); // offsetd is added in bytes
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
new file mode 100644
index 00000000000..5bdd4d06763
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@@ -0,0 +1,271 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// Assumed constants: weights per Q4_0 block, and number of waves (sub-groups) that split the K loop.
+#define QK4_0 32
+#define N_SIMDGROUP 4
+// First half of a block for two rows: 16 weights per row times B values broadcast one float at a time from sub-group lanes 0 and 1. Even bits4 components and scale.s0 feed total_sums.s0; odd components and scale.s1 feed total_sums.s1.
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+    float shared_y; /* declared here and reused by the matching _lo macro, so this macro must expand first in the same scope */ \
+    shared_y = sub_group_broadcast(y.s0, 0); /* lane 0: y.s0..s3 pair with the four nibbles of bits4.s0 / bits4.s1 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; /* (4-bit value - 8) * row scale * B value */ \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); /* lane 0: y.s4..s7 pair with bits4.s2 / bits4.s3 */ \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); /* lane 1: y.s0..s3 pair with bits4.s4 / bits4.s5 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); /* lane 1: y.s4..s7 pair with bits4.s6 / bits4.s7 */ \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+// Second half of a block: same arithmetic as the _1_hi macro, but B values are broadcast from sub-group lanes 2 and 3. It does not declare shared_y, so it must follow the _1_hi macro in the same scope.
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); /* lane 2: y.s0..s3 pair with the four nibbles of bits4.s0 / bits4.s1 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); /* lane 2: y.s4..s7 pair with bits4.s2 / bits4.s3 */ \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); /* lane 3: y.s0..s3 pair with bits4.s4 / bits4.s5 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); /* lane 3: y.s4..s7 pair with bits4.s6 / bits4.s7 */ \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+// Vector form of the first-half accumulation: the whole float8 y is broadcast from lane 0, then lane 1, and 16 weights per row are accumulated. NOTE(review): a float8 sub_group_broadcast is presumably a vendor extension - confirm.
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+    float8 shared_y; /* declared here and reused by the matching _lo macro, so this macro must expand first in the same scope */ \
+    shared_y = sub_group_broadcast(y, 0); /* lane 0: row 0 uses bits4.s0 and bits4.s2, row 1 uses bits4.s1 and bits4.s3 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); /* lane 1: row 0 uses bits4.s4 and bits4.s6, row 1 uses bits4.s5 and bits4.s7 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+// Vector form of the second-half accumulation: same arithmetic as the _8_hi macro with y broadcast from lanes 2 and 3. It does not declare shared_y, so it must follow the _8_hi macro in the same scope.
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y, 2); /* lane 2: row 0 uses bits4.s0 and bits4.s2, row 1 uses bits4.s1 and bits4.s3 */ \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); /* lane 3: row 0 uses bits4.s4 and bits4.s6, row 1 uses bits4.s5 and bits4.s7 */ \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+// General-shape Q4_0 GEMV: same algorithm as the fixed-shape kernel of the same name, but K, M and the A strides are derived from the ne00 / ne01 arguments at run time.
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+        __read_only  image1d_buffer_t src0_q,  // quantized A
+        global half2  * src0_d,  // A scales
+        __read_only  image1d_buffer_t src1,    // B
+        ulong offset1,            // offset to B (0); not referenced in the kernel body
+        global float * dst,     // C
+        ulong offsetd,            // offset to C (0); applied to dst as a byte offset
+        int ne00,               // K
+        int ne01,               // M
+        int ne02,               // 1 (ne02 through r3 are not referenced in the kernel body)
+        int ne10,               // K
+        int ne12,               // 1
+        int ne0,                // M
+        int ne1,                // N
+        int r2,                 // 1
+        int r3)
+{
+    uint groupId = get_local_id(1); // NOTE(review): treated as the wave index 0..3 below; assumes each local-y slice is one sub-group - confirm launch config
+    uint gid     = get_global_id(0); // selects the output pair dst[2*gid], dst[2*gid + 1] written at the end
+    ushort slid    = get_sub_group_local_id(); // lane index within the sub-group
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A = M / 2; // index step between consecutive lines of A: one element per row pair
+    uint BLOCK_STRIDE_A = N_SIMDGROUP * M; // index step between K-blocks: equals the 8 lines read per block when M is even
+
+    __private uint4     regA; // packed 4-bit weights; reinterpreted as ushort8 for the accumulate macros
+    __private half2     regS; // scales of the two rows for the current block
+    __private float8    regB; // 8 B values; only assigned in lanes 0-3
+
+    __private float2 totalSum = (float2)(0.0f);
+
+    // loop along K in block granularity, skip 4 blocks every iter
+    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
+        // first 4 fibers in each wave load 8 B values to its private scope
+        if (slid < 4) { // other lanes leave regB unset; the macros only broadcast from lanes 0-3
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        // load half weights for two blocks in consecutive rows
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; // lines 0-3 of the block feed the _hi macro
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT // NOTE: spelled "BROADCAT"; a define must match this spelling to select the float8 path
+        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; // lines 4-7 of the block feed the _lo macro
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+    }
+
+    // reduction in local memory, assumes #wave=4
+    __local float2 reduceLM[SIMDGROUP_WIDTH * 3]; // one slot per lane for groupId 1, 2 and 3; SIMDGROUP_WIDTH is not defined in this file
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE); // partial sums must be visible before groupId 0 reads them
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd); // offsetd is added in bytes
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
new file mode 100644
index 00000000000..e19e9a2f436
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
@@ -0,0 +1,1225 @@
+//------------------------------------------------------------------------------
+// This file contains additional mulmat kernels
+// (and potentially other kernels).
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16 // fp16 is required: the kernels below take their scales as `half`
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supported by OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups // subgroups are required: the kernels below use sub_group_reduce_add
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Intel: kernels request a subgroup size of 16 or 32 through these attributes.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Adreno: the macro names map "half" to a subgroup size of 64 and "full" to 128.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32  // elements per q4_0 block; the kernels below compute nb = ne00/QK4_0
+#define QR4_0                   2   // NOTE(review): presumably values packed per byte; unused in the visible code
+#define QK4_1                   32  // QK*/QR* for q4_1, q5_0, q5_1, q8_0: unused in the visible code
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256 // sizes the ql/qh/scales arrays of block_q6_K
+#define K_QUANTS_PER_ITERATION  2   // NOTE(review): unused in the visible code - confirm it is still needed
+
+typedef char int8_t;     // C-style fixed-width names for the OpenCL built-in integer types
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0: one half `d` followed by QK4_0/2 bytes of packed quants
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;                // NOTE(review): presumably the per-block scale - confirm
+    uint8_t qs[QK4_0 / 2]; // QK4_0/2 bytes; the dot helpers below read two 4-bit values per byte
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight: (128 + 64 + 16 + 2) bytes * 8 / 256 weights
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    half d;                  // super-block scale
+} block_q6_K;
+
+//------------------------------------------------------------------------------
+// Matrix-matrix (matmatmul) variants derived from the matvecmul kernel; they
+// operate on flattened block_q4_0 data (separate quant and scale buffers).
+//------------------------------------------------------------------------------
+
+// Dot product of 16 pre-scaled activations with 4 ushorts of packed quants.
+inline float mm_block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half  * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float           scale  = *dh;
+    global ushort * packed = ((global ushort *)x + il/2);
+    float           dot    = 0.f;
+
+    dot += (packed[0] & 0x000F) * yl.s0; // nibbles stay in place (not shifted down);
+    dot += (packed[0] & 0x0F00) * yl.s1; // the callers divide the matching yl lanes
+    dot += (packed[0] & 0x00F0) * yl.s8; // by 256, 16 and 4096 to compensate
+    dot += (packed[0] & 0xF000) * yl.s9;
+
+    dot += (packed[1] & 0x000F) * yl.s2;
+    dot += (packed[1] & 0x0F00) * yl.s3;
+    dot += (packed[1] & 0x00F0) * yl.sa;
+    dot += (packed[1] & 0xF000) * yl.sb;
+
+    dot += (packed[2] & 0x000F) * yl.s4;
+    dot += (packed[2] & 0x0F00) * yl.s5;
+    dot += (packed[2] & 0x00F0) * yl.sc;
+    dot += (packed[2] & 0xF000) * yl.sd;
+
+    dot += (packed[3] & 0x000F) * yl.s6;
+    dot += (packed[3] & 0x0F00) * yl.s7;
+    dot += (packed[3] & 0x00F0) * yl.se;
+    dot += (packed[3] & 0xF000) * yl.sf;
+
+    return scale * (sumy * -8.f + dot); // algebraically scale * sum((q - 8) * y) when sumy is the sum of those y
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 8 // same meaning as in the Intel branch
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64 // matches REQD_SUBGROUP_SIZE_64 on the kernel below
+#endif
+//
+// This variant performs 1d blocking with 8x output.
+// Each simdgroup outputs 8 values on `n0` dim (row in the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_8x_flat(
+        global uchar * src0_q, // flattened q4_0 quants, QK4_0/2 bytes per block
+        global half  * src0_d, // flattened q4_0 scales, one half per block
+        global float * src1,   // f32 activations
+        global float * dst,    // f32 output
+        int ne00,              // src0 row length in elements (nb = ne00/QK4_0)
+        int ne01,              // number of src0 rows; also bounds the stores below
+        int ne02,              // src0 extent used for the i13 stride
+        int ne10,              // src1 row stride in floats
+        int ne12,              // splits the group id `im` into (i12, i13)
+        int ne0,               // dst row stride in floats
+        int ne1,               // dst rows per `im` slice
+        int r2,                // divisor mapping i12 to a src0 slice
+        int r3                 // divisor mapping i13 to a src0 slice
+) {
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // One scale per block; offsets are accumulated in ulong so large tensors cannot overflow int.
+    ulong offset0_d = (ulong)first_row*nb + (ulong)(i12/r2)*nb*ne01 + (ulong)(i13/r3)*nb*ne01*ne02;
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = offset0_d * (QK4_0/2);
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;     // block index handled by this pair of lanes
+    int il = 8*(get_sub_group_local_id()%2); // 0 or 8: which half of the block's bytes this lane reads
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;   // pre-divided: the dot helper masks with 0x0F00 without shifting
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;   // pairs with the 0x00F0 mask
+        yl.s9 = yb[17]/4096.f; // pairs with the 0xF000 mask
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) { // only the stores are bounds-checked; the loads above are not
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    // Apply the byte offsets to src1/dst, then delegate to the 8-row implementation.
+    global float * act = (global float*)((global char*)src1 + offset1);
+    global float * out = (global float*)((global char*)dst + offsetd);
+    mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, act, out, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 16 // each SIMD group works on 16 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 16 // same meaning as in the Intel branch
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64 // matches REQD_SUBGROUP_SIZE_64 on the kernel below
+#endif
+//
+// This variant performs 1d blocking with 16x output.
+// Each simdgroup outputs 16 values on `n0` dim (row in the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_16x_flat(
+        global uchar * src0_q, // flattened q4_0 quants, QK4_0/2 bytes per block
+        global half  * src0_d, // flattened q4_0 scales, one half per block
+        global float * src1,   // f32 activations
+        global float * dst,    // f32 output
+        int ne00,              // src0 row length in elements (nb = ne00/QK4_0)
+        int ne01,              // number of src0 rows; also bounds the stores below
+        int ne02,              // src0 extent used for the i13 stride
+        int ne10,              // src1 row stride in floats
+        int ne12,              // splits the group id `im` into (i12, i13)
+        int ne0,               // dst row stride in floats
+        int ne1,               // dst rows per `im` slice
+        int r2,                // divisor mapping i12 to a src0 slice
+        int r3                 // divisor mapping i13 to a src0 slice
+) {
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // One scale per block; offsets are accumulated in ulong so large tensors cannot overflow int.
+    ulong offset0_d = (ulong)first_row*nb + (ulong)(i12/r2)*nb*ne01 + (ulong)(i13/r3)*nb*ne01*ne02;
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = offset0_d * (QK4_0/2);
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+                             0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;     // block index handled by this pair of lanes
+    int il = 8*(get_sub_group_local_id()%2); // 0 or 8: which half of the block's bytes this lane reads
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;   // pre-divided: the dot helper masks with 0x0F00 without shifting
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;   // pairs with the 0x00F0 mask
+        yl.s9 = yb[17]/4096.f; // pairs with the 0xF000 mask
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  0*nb*QK4_0/2, d + ib +  0*nb, sumy, yl, il);
+        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  1*nb*QK4_0/2, d + ib +  1*nb, sumy, yl, il);
+        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  2*nb*QK4_0/2, d + ib +  2*nb, sumy, yl, il);
+        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  3*nb*QK4_0/2, d + ib +  3*nb, sumy, yl, il);
+
+        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  4*nb*QK4_0/2, d + ib +  4*nb, sumy, yl, il);
+        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  5*nb*QK4_0/2, d + ib +  5*nb, sumy, yl, il);
+        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  6*nb*QK4_0/2, d + ib +  6*nb, sumy, yl, il);
+        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  7*nb*QK4_0/2, d + ib +  7*nb, sumy, yl, il);
+
+        sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  8*nb*QK4_0/2, d + ib +  8*nb, sumy, yl, il);
+        sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  9*nb*QK4_0/2, d + ib +  9*nb, sumy, yl, il);
+        sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
+        sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
+
+        sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
+        sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
+        sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
+        sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float16 tot = (float16)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
+
+        sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
+        sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
+        sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
+        sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) { // only the stores are bounds-checked; the loads above are not
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+
+        if (first_row + 8 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
+        }
+        if (first_row + 9 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
+        }
+        if (first_row + 10 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
+        }
+        if (first_row + 11 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
+        }
+
+        if (first_row + 12 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
+        }
+        if (first_row + 13 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
+        }
+        if (first_row + 14 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
+        }
+        if (first_row + 15 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    // Apply the byte offsets to src1/dst, then delegate to the 16-row implementation.
+    global float * act = (global float*)((global char*)src1 + offset1);
+    global float * out = (global float*)((global char*)dst + offsetd);
+    mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, act, out, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mat_q4_0_f32_flat_v0 and its per-thread dot-product helpers
+//------------------------------------------------------------------------------
+inline float block_q_4_0_dot_y_flat_v2(
+    half   x,
+    half   d,
+    float  sumy,
+    float4 yl
+) {
+    const uchar2 packed = as_uchar2(x); // two quant bytes, bit-cast from the half
+    float dot = 0.0f;
+    // Low nibbles pair with yl.s0/yl.s1.
+    dot += yl.s0 * (packed.s0 & 0x0F);
+    dot += yl.s1 * (packed.s1 & 0x0F);
+    // High nibbles stay unshifted (16x too large); the caller divides yl.s2/yl.s3 by 16.
+    dot += yl.s2 * (packed.s0 & 0xF0);
+    dot += yl.s3 * (packed.s1 & 0xF0);
+
+    return d * (sumy * -8.f + dot);
+}
+
+inline float block_q_4_0_dot_y_flat_v4(
+    float  x,
+    half   d,
+    float  sumy,
+    float8 yl
+) {
+    const uchar4 packed = as_uchar4(x); // four quant bytes, bit-cast from the float
+    float dot = 0.0f;
+    // Low nibbles pair with yl.s0..yl.s3.
+    dot += yl.s0 * (packed.s0 & 0x0F);
+    dot += yl.s1 * (packed.s1 & 0x0F);
+    dot += yl.s2 * (packed.s2 & 0x0F);
+    dot += yl.s3 * (packed.s3 & 0x0F);
+    // High nibbles stay unshifted (16x too large); the caller divides yl.s4..yl.s7 by 16.
+    dot += yl.s4 * (packed.s0 & 0xF0);
+    dot += yl.s5 * (packed.s1 & 0xF0);
+    dot += yl.s6 * (packed.s2 & 0xF0);
+    dot += yl.s7 * (packed.s3 & 0xF0);
+
+    return d * (sumy * -8.f + dot);
+}
+
+inline float block_q_4_0_dot_y_flat_v8(
+    float2  x,
+    half    d,
+    float   sumy,
+    float16 yl
+) {
+    const uchar8 packed = as_uchar8(x); // eight quant bytes, bit-cast from the float2
+    float dot = 0.0f;
+    // Low nibbles pair with yl.s0..yl.s7.
+    dot += yl.s0 * (packed.s0 & 0x0F);
+    dot += yl.s1 * (packed.s1 & 0x0F);
+    dot += yl.s2 * (packed.s2 & 0x0F);
+    dot += yl.s3 * (packed.s3 & 0x0F);
+    dot += yl.s4 * (packed.s4 & 0x0F);
+    dot += yl.s5 * (packed.s5 & 0x0F);
+    dot += yl.s6 * (packed.s6 & 0x0F);
+    dot += yl.s7 * (packed.s7 & 0x0F);
+    // High nibbles stay unshifted (16x too large); the caller divides yl.s8..yl.sf by 16.
+    dot += yl.s8 * (packed.s0 & 0xF0);
+    dot += yl.s9 * (packed.s1 & 0xF0);
+    dot += yl.sa * (packed.s2 & 0xF0);
+    dot += yl.sb * (packed.s3 & 0xF0);
+    dot += yl.sc * (packed.s4 & 0xF0);
+    dot += yl.sd * (packed.s5 & 0xF0);
+    dot += yl.se * (packed.s6 & 0xF0);
+    dot += yl.sf * (packed.s7 & 0xF0);
+
+    return d * (sumy * -8.f + dot);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4   // Number of threads per block, or each thread process 1/THREADS_PER_BLK of a block
+#define N_DST           4   // output rows produced per SIMD group
+#define N_SIMDGROUP     1   // SIMD groups per work-group
+#define N_SIMDWIDTH     16  // matches REQD_SUBGROUP_SIZE_16 on the kernel below
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     64  // matches REQD_SUBGROUP_SIZE_64 on the kernel below
+#endif
+
+#if THREADS_PER_BLK == 2                // Each thread processes 1/2 block
+#   define ACT_TY                       float16
+#   define Q_BLK_LD_TY                  float2
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4              // Each thread processes 1/4 block
+#   define ACT_TY                       float8
+#   define Q_BLK_LD_TY                  float
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8              // Each thread processes 1/8 block
+#   define ACT_TY                       float4
+#   define Q_BLK_LD_TY                  half
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK         (QK4_0/2/THREADS_PER_BLK) // quant bytes each thread reads per block
+
+#if N_DST == 2                          // SUM_TY holds one partial sum per output row
+#   define  SUM_TY                      float2
+#elif N_DST == 4
+#   define  SUM_TY                      float4
+#elif N_DST == 8
+#   define  SUM_TY                      float8
+#elif N_DST == 16
+#   define  SUM_TY                      float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_v0(
+        global uchar * src0_q, // flattened q4_0 quants, QK4_0/2 bytes per block
+        global half  * src0_d, // flattened q4_0 scales, one half per block
+        global float * src1,   // f32 activations
+        ulong offset1,         // byte offset added to src1
+        global float * dst,    // f32 output
+        ulong offsetd,         // byte offset added to dst
+        int ne00,              // src0 row length in elements (nb = ne00/QK4_0)
+        int ne01,              // number of src0 rows; also bounds the stores below
+        int ne02,              // src0 extent used for the i13 stride
+        int ne10,              // src1 row stride in floats
+        int ne12,              // splits the group id `im` into (i12, i13)
+        int ne0,               // dst row stride in floats
+        int ne1,               // dst rows per `im` slice
+        int r2,                // divisor mapping i12 to a src0 slice
+        int r3                 // divisor mapping i13 to a src0 slice
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // One scale per block; offsets are accumulated in ulong so large tensors cannot overflow int.
+    ulong offset0_d = (ulong)first_row*nb + (ulong)(i12/r2)*nb*ne01 + (ulong)(i13/r3)*nb*ne01*ne02;
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = offset0_d * (QK4_0/2);
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    int ix = get_sub_group_local_id()/THREADS_PER_BLK; // block index handled by this group of lanes
+    int il = get_sub_group_local_id()%THREADS_PER_BLK; // which slice of the block this lane reads
+
+    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+    // Registers for caching activation
+    ACT_TY yl = 0.f;
+
+    // Registers for caching quants
+    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+    // Partial sum
+    SUM_TY sumf = 0.f;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
+        float sumy = 0.f;
+
+        q_blk_0 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 0*nb*QK4_0/2);
+        q_blk_1 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 1*nb*QK4_0/2);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        q_blk_2 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 2*nb*QK4_0/2);
+        q_blk_3 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 3*nb*QK4_0/2);
+#endif
+#if N_DST == 8 || N_DST == 16
+        q_blk_4 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 4*nb*QK4_0/2));
+        q_blk_5 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 5*nb*QK4_0/2));
+        q_blk_6 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 6*nb*QK4_0/2));
+        q_blk_7 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 7*nb*QK4_0/2));
+#endif
+
+        // Load activation
+#if THREADS_PER_BLK == 2    // Each thread processes 1/2 block
+        yl.s01234567 = *(global float8 *)(yb);
+        yl.s89abcdef = *(global float8 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4;
+        sumy += yl.s5;
+        sumy += yl.s6;
+        sumy += yl.s7;
+        sumy += yl.s8; yl.s8 /= 16.f;
+        sumy += yl.s9; yl.s9 /= 16.f;
+        sumy += yl.sa; yl.sa /= 16.f;
+        sumy += yl.sb; yl.sb /= 16.f;
+        sumy += yl.sc; yl.sc /= 16.f;
+        sumy += yl.sd; yl.sd /= 16.f;
+        sumy += yl.se; yl.se /= 16.f;
+        sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4  // Each thread processes 1/4 block
+        yl.s0123 = *(global float4 *)(yb);
+        yl.s4567 = *(global float4 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4; yl.s4 /= 16.f; // pre-divided: the dot helper masks with 0xF0 without shifting
+        sumy += yl.s5; yl.s5 /= 16.f;
+        sumy += yl.s6; yl.s6 /= 16.f;
+        sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8  // Each thread processes 1/8 block
+        yl.s01 = *(global float2 *)(yb);
+        yl.s23 = *(global float2 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2; yl.s2 /= 16.f;
+        sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, *(d + ib + 0*nb), sumy, yl);
+        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, *(d + ib + 1*nb), sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, *(d + ib + 2*nb), sumy, yl);
+        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, *(d + ib + 3*nb), sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, *(d + ib + 4*nb), sumy, yl);
+        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, *(d + ib + 5*nb), sumy, yl);
+        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, *(d + ib + 6*nb), sumy, yl);
+        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, *(d + ib + 7*nb), sumy, yl);
+#endif
+
+        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
+    }
+
+    SUM_TY tot = (SUM_TY)(
+          sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) { // only the stores are bounds-checked; the loads above are not
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+#endif
+#if N_DST == 8 || N_DST == 16
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+#endif
+    }
+}
+
+//------------------------------------------------------------------------------
+// Using image1d_buffer_t
+
+#if defined(cl_qcom_subgroup_shuffle) // when available, sum across the subgroup with shuffles instead
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+float qcom_sub_group_reduce_add(float sum) { // NOTE(review): offsets 32..1 presumably assume a 64-wide subgroup - confirm
+    sum += qcom_sub_group_shuffle_down(sum, 32, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f); // NOTE(review): presumably adds the value held 32 lanes away, 0.f when out of range
+    sum += qcom_sub_group_shuffle_down(sum, 16, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f); // each step halves the offset
+    sum += qcom_sub_group_shuffle_down(sum,  8, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  4, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  2, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  1, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    return sum; // NOTE(review): presumably only the first lane ends up with the complete sum - verify against the extension spec
+}
+#define sub_group_reduce_add qcom_sub_group_reduce_add // takes effect only for code after this line
+#else
+#define sub_group_reduce_add sub_group_reduce_add // expands to itself: the built-in stays in use
+#endif
+
+#undef THREADS_PER_BLK // reset the tuning macros for kernel_mul_mat_q4_0_f32_flat_img_v0 below
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4   // Number of threads per block, or each thread process 1/THREADS_PER_BLK of a block
+#define N_DST           4   // output rows produced per SIMD group
+#define N_SIMDGROUP     1   // SIMD groups per work-group
+#define N_SIMDWIDTH     16  // matches REQD_SUBGROUP_SIZE_16 on the kernel below
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     64  // matches REQD_SUBGROUP_SIZE_64 on the kernel below
+#endif
+
+#if THREADS_PER_BLK == 2                // Each thread processes 1/2 block
+#   define ACT_TY                       float16
+#   define Q_BLK_LD_TY                  float2
+#   define EXTRACT_BLK_DATA(tmp, part)  (*((float2*)&(tmp) + (part)))
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4              // Each thread processes 1/4 block
+#   define ACT_TY                       float8
+#   define Q_BLK_LD_TY                  float
+#   define EXTRACT_BLK_DATA(tmp, part)  (*((float*)&(tmp) + (part)))
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8              // Each thread processes 1/8 block
+#   define ACT_TY                       float4
+#   define Q_BLK_LD_TY                  half
+#   define EXTRACT_BLK_DATA(tmp, part)  (*((half*)&(tmp) + (part)))
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK         (QK4_0/2/THREADS_PER_BLK) // quant bytes each thread handles per block
+
+#if N_DST == 2                          // SUM_TY holds one partial sum per output row
+#   define  SUM_TY                      float2
+#elif N_DST == 4
+#   define  SUM_TY                      float4
+#elif N_DST == 8
+#   define  SUM_TY                      float8
+#elif N_DST == 16
+#   define  SUM_TY                      float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_img_v0( // Q4_0 (flat layout, image-backed) x f32 product; each subgroup produces N_DST consecutive dst rows
+        read_only image1d_buffer_t src0_q,   // quants, fetched one float4 texel per block
+        read_only image1d_buffer_t src0_d,   // scales, fetched one texel per block (only .s0 is used)
+        global float * src1,                 // activations
+        ulong offset1,                       // byte offset applied to src1
+        global float * dst,                  // output
+        ulong offsetd,                       // byte offset applied to dst
+        int ne00,                            // src0 row length in elements (nb = ne00/QK4_0 blocks per row)
+        int ne01,                            // number of src0 rows; also bounds the dst stores
+        int ne02,                            // src0 extent in dim 2 (used in the batch offset)
+        int ne10,                            // src1 row stride in floats
+        int ne12,                            // src1 extent in dim 2 (splits group id 2 into i12/i13)
+        int ne0,                             // dst row stride in floats
+        int ne1,                             // dst extent in dim 1
+        int r2,                              // divisor mapping i12 to the src0 dim-2 index
+        int r3                               // divisor mapping i13 to the src0 dim-3 index
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int nb = ne00/QK4_0;               // blocks per src0 row
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; // first of the N_DST rows owned by this subgroup
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // The quant image is indexed with one texel per block (see the reads below), so its offset equals the scale offset.
+    ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    int ix = get_sub_group_local_id()/THREADS_PER_BLK; // which block of the current batch this lane works on
+    int il = get_sub_group_local_id()%THREADS_PER_BLK; // which slice of that block this lane handles
+
+    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+    // Registers for caching activation
+    ACT_TY yl = 0.f;
+
+    // Registers for caching quants
+    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+    // Partial sum
+    SUM_TY sumf = 0.f;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) { // a subgroup covers N_SIMDWIDTH/THREADS_PER_BLK blocks per iteration
+        float sumy = 0.f;
+
+        float4 tmp;
+        tmp = read_imagef(src0_q, offset0_q + ib + 0*nb); // row k's block sits k*nb texels after row 0's
+        q_blk_0 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 1*nb);
+        q_blk_1 = EXTRACT_BLK_DATA(tmp, il);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        tmp = read_imagef(src0_q, offset0_q + ib + 2*nb);
+        q_blk_2 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 3*nb);
+        q_blk_3 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+#if N_DST == 8 || N_DST == 16
+        tmp = read_imagef(src0_q, offset0_q + ib + 4*nb);
+        q_blk_4 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 5*nb);
+        q_blk_5 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 6*nb);
+        q_blk_6 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 7*nb);
+        q_blk_7 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+
+        // Load activation: the second half is taken 16 floats after the first and pre-divided by 16
+        // (presumably to match unshifted high nibbles in block_q_4_0_dot_y_flat, defined elsewhere - confirm).
+#if THREADS_PER_BLK == 2    // Each thread processes 1/2 block
+        yl.s01234567 = *(global float8 *)(yb);
+        yl.s89abcdef = *(global float8 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4;
+        sumy += yl.s5;
+        sumy += yl.s6;
+        sumy += yl.s7;
+        sumy += yl.s8; yl.s8 /= 16.f;
+        sumy += yl.s9; yl.s9 /= 16.f;
+        sumy += yl.sa; yl.sa /= 16.f;
+        sumy += yl.sb; yl.sb /= 16.f;
+        sumy += yl.sc; yl.sc /= 16.f;
+        sumy += yl.sd; yl.sd /= 16.f;
+        sumy += yl.se; yl.se /= 16.f;
+        sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4  // Each thread processes 1/4 block
+        yl.s0123 = *(global float4 *)(yb);
+        yl.s4567 = *(global float4 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4; yl.s4 /= 16.f;
+        sumy += yl.s5; yl.s5 /= 16.f;
+        sumy += yl.s6; yl.s6 /= 16.f;
+        sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8  // Each thread processes 1/8 block
+        yl.s01 = *(global float2 *)(yb);
+        yl.s23 = *(global float2 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2; yl.s2 /= 16.f;
+        sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, read_imageh(src0_d, offset0_d + ib + 0*nb).s0, sumy, yl); // one accumulator lane per output row
+        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, read_imageh(src0_d, offset0_d + ib + 1*nb).s0, sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, read_imageh(src0_d, offset0_d + ib + 2*nb).s0, sumy, yl);
+        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, read_imageh(src0_d, offset0_d + ib + 3*nb).s0, sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, read_imageh(src0_d, offset0_d + ib + 4*nb).s0, sumy, yl);
+        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, read_imageh(src0_d, offset0_d + ib + 5*nb).s0, sumy, yl);
+        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, read_imageh(src0_d, offset0_d + ib + 6*nb).s0, sumy, yl);
+        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, read_imageh(src0_d, offset0_d + ib + 7*nb).s0, sumy, yl);
+#endif
+
+        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK); // advance the activation pointer by the same number of blocks as ib
+    }
+
+    SUM_TY tot = (SUM_TY)( // combine every lane's partial sums, row by row, across the subgroup
+          sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+    );
+
+    if (get_sub_group_local_id() == 0) { // a single lane writes the results; rows at or beyond ne01 are skipped
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+#endif
+#if N_DST == 8 || N_DST == 16
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+#endif
+    }
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32
+//------------------------------------------------------------------------------
+
+#undef N_DST       // discard any earlier definitions so the Q6_K kernel can set its own values
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // SIMD group size
+#elif defined (ADRENO_GPU)
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 64 // SIMD group size
+#endif
+
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of (super) blocks a subgroup processes at once, 16 threads per block
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32( // Q6_K x f32 product; each subgroup reduces one src0 row into one dst element
+        global void * src0,         // Q6_K blocks (cast to block_q6_K below)
+        ulong offset0,              // byte offset applied to src0
+        global float * src1,        // activations
+        ulong offset1,              // byte offset applied to src1
+        global float * dst,         // output
+        ulong offsetd,              // byte offset applied to dst
+        int ne00,                   // src0 row length in elements (nb = ne00/QK_K blocks per row)
+        int ne01,                   // number of src0 rows (used in the batch offset)
+        int ne02,                   // src0 extent in dim 2 (used in the batch offset)
+        int ne10,                   // src1 row stride in floats
+        int ne12,                   // src1 extent in dim 2 (splits group id 2 into i12/i13)
+        int ne0,                    // dst row stride in floats
+        int ne1,                    // dst extent in dim 1
+        int r2,                     // divisor mapping i12 to the src0 dim-2 index
+        int r3                      // divisor mapping i13 to the src0 dim-3 index
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    uchar kmask1 = 0x03; // the four 2-bit fields of a qh byte
+    uchar kmask2 = 0x0C;
+    uchar kmask3 = 0x30;
+    uchar kmask4 = 0xC0;
+
+    int nb = ne00/QK_K; // blocks per src0 row
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int row = N_SIMDGROUP * r0 + get_sub_group_id(); // the single row this subgroup computes
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // batch offset, in blocks
+
+    global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
+    global float      * yy = (global float     *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float sumf = 0;
+
+    // For Q6_K quantization, 16 values form a subblock and 16 subblocks form a
+    // block. Values in a subblock share a scale that is quantized with 8 bits;
+    // the entire block shares a single floating point scale.
+    // For work distribution, each thread processes a subblock (16 weights), hence
+    // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
+    // (super) blocks -- this is the block stride.
+    // The 16 threads that process a (super) block are split into 2 portions, each with
+    // 8 threads; each portion works on 8 subblocks.
+    // For a subgroup of 16 threads, the entire subgroup works on a single (super) block
+    // before moving to the next (super) block. Thread0 - thread7 work on the
+    // first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
+    // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
+    // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
+    // works on a total of 16 weight values.
+    int tid  = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+    int ix   = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+    int ip   = tid/8;   // first or second half of (super) block (0 or 1)
+    int il   = tid%8;   // each half has 8 parts, one per scale
+    int n    = 4;       // 4 scales at a time (and 4 sums)
+    int l0   = n*il;    // offset into half-block, 0..28
+    int is   = 8*ip + l0/16; // 0, 1, 8, 9
+
+    int y_offset = 128*ip + l0;   // this thread's start within the block's activations
+    int q_offset_l = 64*ip + l0;  // this thread's start within ql
+    int q_offset_h = 32*ip + l0;  // this thread's start within qh
+
+    for (int i = ix; i < nb; i += BLOCK_STRIDE) { // walk this row's blocks, BLOCK_STRIDE apart
+
+        global uint8_t * q1 = x[i].ql + q_offset_l;
+        global uint8_t * q2 = q1 + QK_K/8;
+        global uint8_t * qh = x[i].qh + q_offset_h;
+        global int8_t  * sc = x[i].scales + is;
+
+        global float * y = yy + i * QK_K + y_offset;
+
+        float dall = x[i].d; // block-wide scale
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+
+        // Each term rebuilds a 6-bit value from a ql nibble (low 4 bits) and a qh field (high 2 bits), then subtracts 32.
+        sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
+
+        sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]); // apply the four subblock scales, then the block scale
+    }
+
+    float tot = sub_group_reduce_add(sumf); // sum the per-thread partials across the subgroup
+    if (get_sub_group_local_id() == 0) { // NOTE(review): no row < ne01 guard on this store - confirm the launch never exceeds ne01 rows
+        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
new file mode 100644
index 00000000000..57768c80334
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
@@ -0,0 +1,130 @@
+// src0_q, src0_d, src1 are transposed as a preprocessing step
+// 4-bit weights are transposed in groups of 4 (unsigned short int)
+// consider weights originally "next to each other", now "on top of each other"
+// each fiber computes a 8x4 tile of output elements
+// using unshuffled weights
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+kernel void kernel_mul_mat_Ab_Bi_8x4(       // each work-item accumulates an 8x4 tile of C in half precision and stores it as float
+        global const ushort * src0_q,       // quantized A
+        global const half  * src0_d,        // A scales
+        __read_only image1d_buffer_t src1,  // B (1d image)
+        global float * dst,                 // C
+        int m,                              // M
+        int n,                              // N with padding
+        int k,                              // K
+        int n_no_padding                    // N without padding
+) {
+
+    // Texels (half4) per K-row of the B image; every read below steps by this amount per K index.
+    int n_4 = n >> 2;
+
+    int gy = get_global_id(0);
+    int gx = get_global_id(1);
+    int gx_2 = gx << 2;
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
+    half8 B; // registers for activations
+    half4 dequantized_weights; // registers for dequantized weights
+    __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
+    __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
+
+    for(int i=0; i<k; i+=4){ //loop through K dimension
+
+        B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
+
+        // keep (i/4) and (i/32) in parenthesis, rounds down
+        // load 4 consecutive groups of 4 weights
+        ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
+
+        // load 4 consecutive scales
+        half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
+
+        // j=0
+        dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
+        dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
+        dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=1
+        B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=2
+        B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=3
+        B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+    }
+
+    int idx = (gy<<3)*m + (gx<<2); // start of the tile's first output row; each vstore4 below writes 4 consecutive floats
+
+    // conditional check if store is to a valid location. Required when N is not a multiple of 8
+    // if statements allow registers to be reused for each store
+    // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
+    if(idx+3 < m*n_no_padding){ // idx only advances after a store, so once one guard fails every later guard fails too
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
new file mode 100644
index 00000000000..d59a0c05ddf
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
@@ -0,0 +1,32 @@
+// 16-bit transpose, loading/storing an 8x8 tile of elements
+
+kernel void kernel_transpose_16(
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
+    const uint cols
+) {
+    // One work-item transposes an 8x8 tile of 16-bit values; each texel is moved as float4 bits holding 8 of them.
+    const int tx = get_global_id(0);
+    const int ty = get_global_id(1);
+    const int dst_base = tx * 8;
+    const int src_base = ty * 8;
+
+    ushort8 v0 = as_ushort8(read_imagef(input, (src_base+0)*cols+tx));
+    ushort8 v1 = as_ushort8(read_imagef(input, (src_base+1)*cols+tx));
+    ushort8 v2 = as_ushort8(read_imagef(input, (src_base+2)*cols+tx));
+    ushort8 v3 = as_ushort8(read_imagef(input, (src_base+3)*cols+tx));
+    ushort8 v4 = as_ushort8(read_imagef(input, (src_base+4)*cols+tx));
+    ushort8 v5 = as_ushort8(read_imagef(input, (src_base+5)*cols+tx));
+    ushort8 v6 = as_ushort8(read_imagef(input, (src_base+6)*cols+tx));
+    ushort8 v7 = as_ushort8(read_imagef(input, (src_base+7)*cols+tx));
+    // Output texel c gathers component c of every loaded texel.
+    write_imagef(output, (dst_base+0)*rows+ty, as_float4((ushort8)(v0.s0, v1.s0, v2.s0, v3.s0, v4.s0, v5.s0, v6.s0, v7.s0)));
+    write_imagef(output, (dst_base+1)*rows+ty, as_float4((ushort8)(v0.s1, v1.s1, v2.s1, v3.s1, v4.s1, v5.s1, v6.s1, v7.s1)));
+    write_imagef(output, (dst_base+2)*rows+ty, as_float4((ushort8)(v0.s2, v1.s2, v2.s2, v3.s2, v4.s2, v5.s2, v6.s2, v7.s2)));
+    write_imagef(output, (dst_base+3)*rows+ty, as_float4((ushort8)(v0.s3, v1.s3, v2.s3, v3.s3, v4.s3, v5.s3, v6.s3, v7.s3)));
+    write_imagef(output, (dst_base+4)*rows+ty, as_float4((ushort8)(v0.s4, v1.s4, v2.s4, v3.s4, v4.s4, v5.s4, v6.s4, v7.s4)));
+    write_imagef(output, (dst_base+5)*rows+ty, as_float4((ushort8)(v0.s5, v1.s5, v2.s5, v3.s5, v4.s5, v5.s5, v6.s5, v7.s5)));
+    write_imagef(output, (dst_base+6)*rows+ty, as_float4((ushort8)(v0.s6, v1.s6, v2.s6, v3.s6, v4.s6, v5.s6, v6.s6, v7.s6)));
+    write_imagef(output, (dst_base+7)*rows+ty, as_float4((ushort8)(v0.s7, v1.s7, v2.s7, v3.s7, v4.s7, v5.s7, v6.s7, v7.s7)));
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
new file mode 100644
index 00000000000..914ec0193e7
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
@@ -0,0 +1,25 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+
+kernel void kernel_transpose_32(
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
+    const uint cols
+) {
+    // One work-item transposes a 4x4 tile: four float4 texels in, four float4 texels out.
+    const int tx = get_global_id(0);
+    const int ty = get_global_id(1);
+    const int dst_base = tx * 4;
+    const int src_base = ty * 4;
+
+    float4 v0 = read_imagef(input, (src_base+0)*cols+tx);
+    float4 v1 = read_imagef(input, (src_base+1)*cols+tx);
+    float4 v2 = read_imagef(input, (src_base+2)*cols+tx);
+    float4 v3 = read_imagef(input, (src_base+3)*cols+tx);
+    // Output texel c gathers component c of every loaded texel.
+    write_imagef(output, (dst_base+0)*rows+ty, (float4)(v0.x, v1.x, v2.x, v3.x));
+    write_imagef(output, (dst_base+1)*rows+ty, (float4)(v0.y, v1.y, v2.y, v3.y));
+    write_imagef(output, (dst_base+2)*rows+ty, (float4)(v0.z, v1.z, v2.z, v3.z));
+    write_imagef(output, (dst_base+3)*rows+ty, (float4)(v0.w, v1.w, v2.w, v3.w));
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
new file mode 100644
index 00000000000..d3bd1fabb76
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
@@ -0,0 +1,35 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+// Only used for activations
+// converts to FP16
+// also adds zero padding for non multiple of 8 prompt lengths
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
+    // One work-item transposes a 4x4 tile of half values; tile rows whose guard fails stay zero, which yields the padding.
+    const int tx = get_global_id(0);
+    const int ty = get_global_id(1);
+    const int dst_base = tx * 4;
+    const int src_base = ty * 4;
+    half4 v0 = {0,0,0,0}; // zero unless the guarded load below replaces it
+    half4 v1 = {0,0,0,0};
+    half4 v2 = {0,0,0,0};
+    half4 v3 = {0,0,0,0};
+    const uint limit = rows*cols*16; // upper bound used by every load guard
+    if((src_base+0)*cols+dst_base+3 < limit){ // load only from a location the guard accepts
+        v0 = read_imageh(input, (src_base+0)*cols+tx);
+    }
+    if((src_base+1)*cols+dst_base+3 < limit){
+        v1 = read_imageh(input, (src_base+1)*cols+tx);
+    }
+    if((src_base+2)*cols+dst_base+3 < limit){
+        v2 = read_imageh(input, (src_base+2)*cols+tx);
+    }
+    if((src_base+3)*cols+dst_base+3 < limit){
+        v3 = read_imageh(input, (src_base+3)*cols+tx);
+    }
+    // Stores are unconditional and use the padded row stride, so the zero rows are written out too.
+    write_imageh(output, (dst_base+0)*padded_rows+ty, (half4)(v0.x, v1.x, v2.x, v3.x));
+    write_imageh(output, (dst_base+1)*padded_rows+ty, (half4)(v0.y, v1.y, v2.y, v3.y));
+    write_imageh(output, (dst_base+2)*padded_rows+ty, (half4)(v0.z, v1.z, v2.z, v3.z));
+    write_imageh(output, (dst_base+3)*padded_rows+ty, (half4)(v0.w, v1.w, v2.w, v3.w));
+}
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index 97ab2003c7f..88314a5cd73 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -12,6 +12,9 @@
 
 #include "common.hpp"
 
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
 int get_current_device_id() {
   return dpct::dev_mgr::instance().current_device_id();
 }
@@ -28,11 +31,7 @@ void* ggml_sycl_host_malloc(size_t size) try {
 
   if (err != 0) {
     // clear the error
-    fprintf(
-        stderr,
-        "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-        size / 1024.0 / 1024.0,
-        "syclGetErrorString is not supported");
+    GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0,    "syclGetErrorString is not supported");
     return nullptr;
   }
 
@@ -66,17 +65,11 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const ggml_sycl_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
 
     const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+    if(use_src1)
+      GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
 
     // dd = data device
     float * src0_ddf = (float *) src0->data;
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 4549fa5e95a..62b4cea3ada 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -26,7 +26,11 @@
 
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
+/* suppress warning spam */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
+#pragma clang diagnostic pop
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
@@ -626,6 +630,7 @@ struct bin_bcast_sycl {
                     });
             }
         }
+        GGML_UNUSED(ctx);
     }
 };
 
diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp
index c90c452d878..a240968ad2e 100644
--- a/ggml/src/ggml-sycl/concat.cpp
+++ b/ggml/src/ggml-sycl/concat.cpp
@@ -47,7 +47,7 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
   // operation
   int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                    item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-  if (item_ct1.get_group(1) < ne01) { // src0
+  if (item_ct1.get_group(1) < (size_t) ne01) { // src0
     int offset_src =
         nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
     dst[offset_dst] = x[offset_src];
@@ -70,7 +70,7 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
   // operation
   int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                    item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-  if (item_ct1.get_group(0) < ne02) { // src0
+  if (item_ct1.get_group(0) < (size_t) ne02) { // src0
     int offset_src = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
     dst[offset_dst] = x[offset_src];
diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp
index 5fd15e6cdcc..05b01db2d8b 100644
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y,
     const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
 
     // make each work-item deal with more elements since sycl global range can not exceed max int
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
     for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
         y[i] = x[i];
     }
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 0c3dfaa37eb..0d097357ce7 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1015,9 +1015,9 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
 }
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index d1b5dd87c69..e167948e7a3 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -1237,7 +1237,7 @@ namespace dpct
 
             std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
             {
-                auto it = m_map.upper_bound((byte_t *)ptr);
+                auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
                 if (it == m_map.end())
                 {
                     // Not a virtual pointer.
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index e5cd736eba9..d05a51f807c 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -237,7 +237,7 @@ void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
 void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
@@ -251,8 +251,7 @@ void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const i
     // operation
     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
+    if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
         int offset_src = nidx + item_ct1.get_group(1) * ne00 +
                          item_ct1.get_group(0) * ne00 * ne01;
             dst[offset_dst] = x[offset_src];
@@ -520,9 +519,10 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -535,9 +535,10 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                     const ggml_tensor *src1, ggml_tensor *dst,
@@ -550,9 +551,10 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_
 
     gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -564,9 +566,10 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
     tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -579,9 +582,10 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -595,9 +599,10 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml
 
     hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -610,9 +615,10 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_t
 
     hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -625,9 +631,10 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -640,9 +647,10 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -655,9 +663,10 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_ten
 
     sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -670,9 +679,10 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -685,9 +695,10 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -700,9 +711,10 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -715,9 +727,10 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -730,9 +743,10 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -749,9 +763,10 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_
 
     leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -764,9 +779,10 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -787,9 +803,10 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_ten
                      dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
                      main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -805,9 +822,10 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
         src0->ne[0], src0->ne[1], src0->ne[2],
         dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -827,7 +845,8 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp
index 2ad9b36f419..3f0f34ad603 100644
--- a/ggml/src/ggml-sycl/gemm.hpp
+++ b/ggml/src/ggml-sycl/gemm.hpp
@@ -51,8 +51,8 @@ class DnnlGemmWrapper {
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
         const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
-        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
         auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
 
@@ -79,8 +79,8 @@ class DnnlGemmWrapper {
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
         const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
-        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
         auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ae3baedc7b6..312ccfeb853 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+        GGML_LOG_ERROR("%s: failed to initialize %s\n", __func__, GGML_SYCL_NAME);
         return info;
     }
 
@@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 #else
     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
 
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
@@ -137,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {
 
     for (int id = 0; id < device_count; ++id) {
       sycl::device device = dpct::dev_mgr::instance().get_device(id);
-      sycl::backend backend = device.get_backend();
       std::string backend_type = get_device_backend_and_type(device);
       int type_id = DeviceNums[backend_type]++;
       std::stringstream device_type;
@@ -289,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                      ggml_tensor *tensor) try {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
 
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+    if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
         return;
     }
 
@@ -420,14 +417,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
         return true;
     }
     return false;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }
 
-
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
      ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@@ -542,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     auto dev_count = ggml_backend_sycl_get_device_count();
 
     if (device>=dev_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, dev_count-1);
         GGML_ASSERT(device<dev_count);
     }
@@ -570,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
 
     int device = ctx->device;
     if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, ggml_sycl_info().device_count-1);
         GGML_ASSERT(device<ggml_sycl_info().device_count);
     }
@@ -749,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
 
-        // FIXME: do not crash if cudaMalloc fails
+        // FIXME: do not crash if SYCL Buffer alloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_sycl_set_device(i);
         const queue_ptr stream = ctx->streams[i];
@@ -791,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
         }
     }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
     tensor->extra = extra;
 }
 catch (sycl::exception const &exc) {
@@ -1092,10 +1086,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
     size_t pool_size = 0;
 
-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
 
     ~ggml_sycl_pool_leg() {
         for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@@ -1238,7 +1229,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
         zeros[i] = 0.f;
         qzeros[i] = 0;
     }
-    const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+    const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
     float sum = xi[0];
     float amax = sycl::fabs(xi[0]);
 #pragma unroll
@@ -1799,6 +1790,9 @@ static  void pool2d_nchw_kernel(
         switch (op) {
             case GGML_OP_POOL_AVG: res = 0; break;
             case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+            default:
+                res      = (To) sycl::nan(uint32_t(0));
+                break;
         }
 
         for (int i = bh; i < eh; i += 1) {
@@ -1817,6 +1811,9 @@ static  void pool2d_nchw_kernel(
                 switch (op) {
                     case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
                     case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                    default:
+                        res = (To) sycl::nan(uint32_t(0));
+                        break;
                 }
             }
         }
@@ -1855,7 +1852,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                                  s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
                          });
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 template <typename src0_t>
@@ -1893,10 +1891,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
             });
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2348,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
 
     dpct::memcpy_direction kind;
     char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
         kind = dpct::host_to_device;
+        //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
         src_ptr = (char *) src->data;
         // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+    } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        // If buffer is a SYCL buffer
+        //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
+        kind    = dpct::device_to_device;
+        src_ptr = (char *) src->data;
+    } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
+        /*
+        If buffer is a SYCL split buffer
+        */
+        //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
+        GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -2464,8 +2472,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens
 
     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
-    (void) src1;
-    (void) src1_d;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
 }
 
 
@@ -2484,17 +2492,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne10 = src1->ne[0];
 
-    const int64_t ne0 = dst->ne[0];
 
     const int64_t row_diff = row_high - row_low;
 
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = id == ctx.device ? ne0 : row_diff;
+#endif
 
 #ifdef GGML_SYCL_F16
     bool use_fp16 = true;  // TODO(Yu) SYCL capability check
@@ -2531,9 +2540,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
                                          : src1_as_f16.get();
         ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
 
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
 #if !GGML_SYCL_DNNL
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16  = 0.0f;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *stream, oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2570,9 +2579,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
 #if !GGML_SYCL_DNNL
+        const float alpha = 1.0f;
+        const float beta  = 0.0f;
 #    ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
             oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
@@ -2590,9 +2599,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
     }
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2638,8 +2647,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
                                item_ct1);
         });
 
-    (void) src1;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2654,9 +2664,10 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2673,9 +2684,10 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_te
 
     sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2694,9 +2706,10 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten
 
     argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2713,9 +2726,10 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tens
 
     argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2735,9 +2749,10 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const gg
 
     diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2758,9 +2773,10 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2783,9 +2799,10 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
 
 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
@@ -2847,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -2862,14 +2879,13 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
 
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
 
     int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -3189,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3222,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3284,12 +3300,11 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
                                              ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const int64_t ne_dst = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();;
@@ -3397,6 +3412,7 @@ catch (sycl::exception const &exc) {
 
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
     return false;
 }
 
@@ -3772,7 +3788,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
         GGML_ABORT("fatal error");
     }
 
-    (void) dst;
+    GGML_UNUSED(dst);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3783,7 +3799,7 @@ catch (sycl::exception const &exc) {
 static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
     ggml_sycl_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
+    GGML_UNUSED(src1);
 }
 
 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,13 +3844,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
 }
 
 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }
 
 void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) return;
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
     check_allow_gpu_index(main_device);
     dpct::select_device(main_device);
 
@@ -4202,6 +4221,7 @@ try
 {
     ggml_backend_sycl_context *sycl_ctx =
         (ggml_backend_sycl_context *)backend->context;
+
     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
 
     const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@@ -4216,7 +4236,7 @@ catch (sycl::exception const &exc)
 }
 
 static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
-    ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
+
     sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
 
     if (ggml_backend_is_sycl(backend)) {
@@ -4475,7 +4495,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
-            return ggml_is_contiguous(op->src[0]);
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
@@ -4616,14 +4645,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
 static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);
 
-    // TODO: update to the current function signature
-    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-    //    return (void *)ggml_backend_sycl_split_buffer_type;
-    //}
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }
 
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
     return nullptr;
 }
 
diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp
index 6a0a0fcd08c..6146a99edbe 100644
--- a/ggml/src/ggml-sycl/im2col.cpp
+++ b/ggml/src/ggml-sycl/im2col.cpp
@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
         im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
     }
 
-    (void) src0;
-    (void) src0_dd;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
 }
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
index e952533d310..8ea82c940c7 100644
--- a/ggml/src/ggml-sycl/mmq.cpp
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 
 #pragma unroll
@@ -961,7 +961,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
 
 #pragma unroll
@@ -1109,7 +1109,7 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                                                  dpct::sub_sat());
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
     float * x_dmf = (float *) x_dm;
 
@@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
             break;
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 7b10cf68814..221f65c21ea 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -753,11 +753,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {
-
-        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
+        stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -780,9 +776,6 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -805,9 +798,6 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -830,8 +820,6 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -854,9 +842,6 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     {
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -954,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
+
     for (int i = 0; i < src1_ncols; i++)
     {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
@@ -1023,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
             break;
         }
     }
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(ctx);
 }
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 72d8fdb878c..9cf2be15575 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -31,7 +31,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
         */
         item_ct1.barrier(sycl::access::fence_space::local_space);
         mean_var = 0.f;
-        int nreduce = nwarps / WARP_SIZE;
+        size_t nreduce = nwarps / WARP_SIZE;
         for (size_t i = 0; i < nreduce; i += 1)
         {
             mean_var += s_sum[lane_id + i * WARP_SIZE];
@@ -55,7 +55,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
     start += item_ct1.get_local_id(2);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
 
     if (end >= ne_elements) {
         end = ne_elements;
@@ -163,7 +163,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
         converged control flow. You may need to adjust the code.
         */
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        int nreduce = nwarps / WARP_SIZE;
+        size_t nreduce = nwarps / WARP_SIZE;
         tmp = 0.f;
         for (size_t i = 0; i < nreduce; i += 1)
         {
@@ -352,6 +352,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     (void)src1;
     (void)dst;
     (void)src1_dd;
+    GGML_UNUSED(ctx);
 }
 
 void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 1f06f78fa3d..1244b231af7 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -269,7 +269,8 @@ void ggml_sycl_op_rope(
         }
     }
 
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }
diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp
index 17a542e4903..a9b3fce0dc4 100644
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -16,7 +16,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
     const int nthreads = block_size;
     const int nwarps = nthreads / WARP_SIZE;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     float slope = 1.0f;
 
     // ALiBi
@@ -53,8 +53,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     if (block_size > WARP_SIZE) {
         if (warp_id == 0) {
             buf[lane_id] = -INFINITY;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = -INFINITY;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
@@ -63,8 +64,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         max_val = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
         }
         max_val = warp_reduce_max(max_val, item_ct1);
@@ -89,8 +89,9 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = 0.f;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
@@ -100,8 +101,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
         tmp = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             tmp += buf[lane_id + i * WARP_SIZE];
         }
         tmp = warp_reduce_sum(tmp, item_ct1);
diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp
index d5c227cd1ab..2ffe3cca917 100644
--- a/ggml/src/ggml-sycl/tsembd.cpp
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml
     const int max_period = dst->op_params[1];
 
     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+    GGML_UNUSED(src1);
 }
diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp
index 4c737f4bfce..75ddfb86ac0 100644
--- a/ggml/src/ggml-sycl/wkv6.cpp
+++ b/ggml/src/ggml-sycl/wkv6.cpp
@@ -59,7 +59,7 @@ static void rwkv_wkv_f32_kernel(
         float y = 0;
 
         // Process in chunks of 4 for better vectorization
-        sycl::float4 k4, r4, tf4, td4, s4, kv4;
+        sycl::float4 k4, r4, tf4, td4, s4;
         #pragma unroll
         for (int j = 0; j < head_size; j += 4) {
             // Load data in vec4 chunks
@@ -135,4 +135,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
                 );
             });
     });
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
 }
diff --git a/ggml/src/ggml-threading.h b/ggml/src/ggml-threading.h
index ce975d880a7..dec2c8840aa 100644
--- a/ggml/src/ggml-threading.h
+++ b/ggml/src/ggml-threading.h
@@ -1,11 +1,13 @@
 #pragma once
 
+#include "ggml.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void ggml_critical_section_start(void);
-void ggml_critical_section_end(void);
+GGML_API void ggml_critical_section_start(void);
+GGML_API void ggml_critical_section_end(void);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a8ae58ee2ce..c0a43631c87 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -163,7 +163,11 @@ struct vk_device_struct {
     uint32_t shader_core_count;
     bool uma;
     bool float_controls_rte_fp16;
-    bool coopmat2;
+
+    bool subgroup_size_control;
+    uint32_t subgroup_min_size;
+    uint32_t subgroup_max_size;
+    bool subgroup_require_full_support;
 
     bool coopmat_support;
     bool coopmat_acc_f32_support;
@@ -171,6 +175,7 @@ struct vk_device_struct {
     uint32_t coopmat_m;
     uint32_t coopmat_n;
     uint32_t coopmat_k;
+    bool coopmat2;
 
     size_t idx;
 
@@ -240,6 +245,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
     vk_pipeline pipeline_pool2d_f32;
+    vk_pipeline pipeline_rwkv_wkv6_f32;
 
     // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
     vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
@@ -523,6 +529,13 @@ struct vk_op_pool2d_push_constants {
     int32_t p0; int32_t p1;
 };
 
+struct vk_op_rwkv_wkv6_push_constants { // four 32-bit push constants; name suggests the RWKV WKV6 op (pipeline_rwkv_wkv6_f32) -- confirm at the dispatch site
+    uint32_t B; // presumably batch size / number of sequences -- TODO confirm against the wkv6 shader
+    uint32_t T; // presumably sequence length in tokens -- TODO confirm
+    uint32_t C; // presumably channel (embedding) count -- TODO confirm
+    uint32_t H; // presumably number of heads -- TODO confirm
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -749,8 +762,12 @@ static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;
 
-static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
-    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint,
+                                         uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
+                                         uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size <<
+                 ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align <<
+                 ", " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
@@ -809,14 +826,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         specialization_constants.data()
     );
 
+    vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
+
+    if (device->subgroup_require_full_support && require_full_subgroups) {
+        pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
+    }
+
     vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
-            vk::PipelineShaderStageCreateFlags(),
+            pipeline_shader_stage_create_flags,
             vk::ShaderStageFlagBits::eCompute,
             pipeline->shader_module,
             entrypoint.c_str(),
             &specialization_info);
+
+    vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
+    pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
+    if (device->subgroup_size_control && required_subgroup_size > 0) {
+        GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
+        pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
+    }
+
     vk::ComputePipelineCreateInfo compute_pipeline_create_info(
-        vk::PipelineCreateFlags(),
+        vk::PipelineCreateFlags{},
         pipeline_shader_create_info,
         pipeline->layout);
 
@@ -1340,7 +1371,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     // Needs to be kept up to date on shader changes
     const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
     const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
-    const uint32_t warps = warptile[0] / device->subgroup_size;
+    const uint32_t warps = warptile[0] / warptile[10];
 
     const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
     const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
@@ -1354,8 +1385,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     std::cerr << "ggml_vulkan: Compiling shaders";
 
-    // some shaders require the subgroup size to be 16 or larger
+    // some shaders have a minimum subgroup size
     const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
+    const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
 
     // mulmat
     std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
@@ -1422,7 +1454,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
         l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
         m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
-        s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+        s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
 
         l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
         m_mmq_wg_denoms = m_wg_denoms = { 64,  64, 1 };
@@ -1496,7 +1528,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
 
     std::vector<std::future<void>> compiles;
-    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
+                                              uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
+                                              uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -1506,7 +1540,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
             }
             compile_count++;
         }
-        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint,
+                                      parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
     };
 
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@@ -1612,40 +1647,59 @@ static void ggml_vk_load_shaders(vk_device& device) {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         if (device->mul_mat ## ID ## _l) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
         if (device->mul_mat ## ID ## _m) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
         if (device->mul_mat ## ID ## _s) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
         if (device->mul_mat ## ID ## _l) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
         if (device->mul_mat ## ID ## _m) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
         if (device->mul_mat ## ID ## _s) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \
 
         // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->coopmat_acc_f16_support) { \
+            CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        } \
+        if (device->coopmat_acc_f32_support) { \
+            CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        } \
 
         CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        if (device->coopmat_acc_f16_support) {
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        } else {
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
 
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        }
 
         // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
         if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
@@ -1653,19 +1707,35 @@ static void ggml_vk_load_shaders(vk_device& device) {
             CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
             CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
 
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
-            CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            if (device->coopmat_acc_f16_support) {
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            } else {
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+                CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            }
         }
+#undef CREATE_MM2
 #undef CREATE_MM
     } else if (device->fp16) {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
@@ -1683,6 +1753,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
         if (device->mul_mat ## ID ## _s) \
             ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
 
+        // Create both accumulator variants of a pipeline: .f16acc is built from the _f16acc shader, .f32acc from the unsuffixed shader
+#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+
         CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
         CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@@ -1720,6 +1795,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
             CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
             CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
         }
+#undef CREATE_MM2
 #undef CREATE_MM
     } else {
         // Create 6 variants, {s,m,l}x{unaligned,aligned}
@@ -1774,53 +1850,63 @@ static void ggml_vk_load_shaders(vk_device& device) {
             CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
             CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
         }
-#undef CREATE_MM2
 #undef CREATE_MM
     }
 
     // mul mat vec
-    // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
+
+    // row multipliers, chosen per GPU vendor/architecture: rm_stdq applies to the Q4_0..Q8_0 and IQ4_NL pipelines, rm_kq to the K-quant (Q2_K..Q6_K) pipelines
+    uint32_t rm_stdq = 1;
+    uint32_t rm_kq = 2;
+    if (device->vendor_id == VK_VENDOR_ID_AMD) {
+        if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
+            rm_stdq = 2;
+            rm_kq = 4;
+        }
+    } else if (device->vendor_id == VK_VENDOR_ID_INTEL)
+        rm_stdq = 2;
+
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32",  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32",  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
 
     // dequant shaders
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1942,6 +2028,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+
     for (auto &c : compiles) {
         c.wait();
     }
@@ -1998,6 +2086,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
                 amd_shader_core_properties2 = true;
             } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
                 pipeline_robustness = true;
+            } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
+                device->subgroup_size_control = true;
             } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
                        !getenv("GGML_VK_DISABLE_COOPMAT")) {
                 device->coopmat_support = true;
@@ -2018,6 +2108,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
         vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
         vk::PhysicalDeviceVulkan12Properties vk12_props;
+        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
+
         props2.pNext = &props3;
         props3.pNext = &subgroup_props;
         subgroup_props.pNext = &driver_props;
@@ -2037,6 +2129,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
             last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
             last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
         }
+        if (device->subgroup_size_control) {
+            last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
+            last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
+        }
 
 #if defined(VK_NV_cooperative_matrix2)
         vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
@@ -2075,7 +2171,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
-        if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
+        if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
             // Intel drivers don't support coopmat properly yet
             // Only RADV supports coopmat properly on AMD
             device->coopmat_support = false;
@@ -2131,6 +2227,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
             device_extensions.push_back("VK_EXT_pipeline_robustness");
         }
 
+        VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
+        subgroup_size_control_features.pNext = nullptr;
+        subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
+        subgroup_size_control_features.computeFullSubgroups = false;
+        subgroup_size_control_features.subgroupSizeControl = false;
+
+        if (device->subgroup_size_control) {
+            last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
+            last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
+        }
+
         VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
         coopmat_features.pNext = nullptr;
         coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
@@ -2158,6 +2265,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
 
+        if (device->subgroup_size_control) {
+            device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
+            device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
+        }
+
+        device->subgroup_size_control = device->subgroup_size_control &&
+                (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
+                subgroup_size_control_features.subgroupSizeControl;
+
+        if (device->subgroup_size_control) {
+            device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
+            device_extensions.push_back("VK_EXT_subgroup_size_control");
+        }
+
         device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
 
         if (coopmat2_support) {
@@ -2307,7 +2428,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
                 }
             }
 
-            if (device->coopmat_m == 0) {
+            if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) {
                 // No suitable matmul mode found
                 GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
                 device->coopmat_support = false;
@@ -2440,7 +2561,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
         }
     }
 
-    if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
+    if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
         // Intel drivers don't support coopmat properly yet
         // Only RADV supports coopmat properly on AMD
         coopmat_support = false;
@@ -2727,7 +2848,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
         return ctx->device->pipeline_matmul_f32_f16;
     }
-    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
+    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
         if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
             return ctx->device->pipeline_matmul_f16_f32.f16acc;
         }
@@ -2802,7 +2923,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
-    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
+    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
         if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
             return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
         }
@@ -3089,8 +3210,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         GGML_ABORT("fatal error");
     }
     // Check if src is pinned memory
-    vk_buffer buf;
-    size_t buf_offset;
+    vk_buffer buf = nullptr;
+    size_t buf_offset = 0;
     ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
 
     const uint64_t ne0 = tensor->ne[0];
@@ -3153,7 +3274,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     VkBufferCopy buf_copy{ 0, offset, copy_size };
 
     ggml_vk_sync_buffers(subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     for (uint64_t i3 = 0; i3 < ne3; i3++) {
         for (uint64_t i2 = 0; i2 < ne2; i2++) {
@@ -3186,7 +3307,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
     }
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
-    size_t buf_offset;
+    size_t buf_offset = 0;
     ggml_vk_host_get(dst->device, src, buf, buf_offset);
 
     if (buf != nullptr) {
@@ -3228,7 +3349,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         copy_size};
 
     ggml_vk_sync_buffers(subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     if (width == spitch) {
         deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
@@ -3284,7 +3405,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
 
     // Check if dst is pinned memory
     vk_buffer buf = nullptr;
-    size_t buf_offset;
+    size_t buf_offset = 0;
     ggml_vk_host_get(src->device, dst, buf, buf_offset);
 
     std::vector<vk::BufferCopy> slices(1);
@@ -3364,7 +3485,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
 
     VkBufferCopy bc{ src_offset, dst_offset, size };
 
-    vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
+    vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
 }
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
@@ -3616,9 +3737,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src0_uma = false;
@@ -3818,9 +3939,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src0_uma = false;
@@ -3996,7 +4117,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
 
     bool src1_uma = false;
@@ -4184,11 +4305,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
     ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
-    vk_buffer d_ids;
+    vk_buffer d_ids = nullptr;
     size_t ids_buf_offset = 0;
 
     bool src0_uma = false;
@@ -4389,11 +4510,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
     ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
-    vk_buffer d_Qx;
+    vk_buffer d_Qx = nullptr;
     size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
+    vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
-    vk_buffer d_ids;
+    vk_buffer d_ids = nullptr;
     size_t ids_buf_offset = 0;
 
     bool src0_uma = false;
@@ -4652,8 +4773,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
     ggml_vk_sync_buffers(subctx);
 
-    vk_buffer d_Q, d_K, d_V, d_D, d_M;
-    uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset;
+    vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
+    size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
 
     bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
 
@@ -4917,6 +5038,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_pool2d_f32;
         }
         return nullptr;
+    case GGML_OP_RWKV_WKV6:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_rwkv_wkv6_f32;
+        }
+        return nullptr;
     case GGML_OP_LEAKY_RELU:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_leaky_relu_f32;
@@ -5319,6 +5445,134 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }
 
+static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) { // Binds the six RWKV WKV6 source tensors plus dst and dispatches the wkv6 pipeline with push constants pc.
+    const ggml_tensor * k = dst->src[0];
+    const ggml_tensor * v = dst->src[1];
+    const ggml_tensor * r = dst->src[2];
+    const ggml_tensor * tf = dst->src[3];
+    const ggml_tensor * td = dst->src[4];
+    const ggml_tensor * state = dst->src[5];
+    // Preconditions: no input may be of a quantized type, and dst must already have a buffer.
+    GGML_ASSERT(!ggml_is_quantized(k->type));
+    GGML_ASSERT(!ggml_is_quantized(v->type));
+    GGML_ASSERT(!ggml_is_quantized(r->type));
+    GGML_ASSERT(!ggml_is_quantized(tf->type));
+    GGML_ASSERT(!ggml_is_quantized(td->type));
+    GGML_ASSERT(!ggml_is_quantized(state->type));
+    GGML_ASSERT(dst->buffer != nullptr);
+    // The lookup returns nullptr unless src0 (k) and dst are both F32, so the assert below enforces that.
+    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6);
+    GGML_ASSERT(pipeline != nullptr);
+
+    if (dryrun) { // dry run: only request one descriptor set for the pipeline, then return without dispatching
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        return;
+    }
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
+    ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
+    ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context;
+    ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context;
+    ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context;
+    ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context;
+
+    ggml_vk_sync_buffers(subctx);
+
+    vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
+    size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
+    bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
+    // On UMA devices, first try ggml_vk_host_get on each data pointer (presumably a pinned-host lookup - confirm); a non-null buffer marks that tensor as resolved.
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, k->data, d_K, k_offset);
+        ggml_vk_host_get(ctx->device, v->data, d_V, v_offset);
+        ggml_vk_host_get(ctx->device, r->data, d_R, r_offset);
+        ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset);
+        ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset);
+        ggml_vk_host_get(ctx->device, state->data, d_State, state_offset);
+        ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
+
+        K_uma = d_K != nullptr;
+        V_uma = d_V != nullptr;
+        R_uma = d_R != nullptr;
+        TF_uma = d_TF != nullptr;
+        TD_uma = d_TD != nullptr;
+        STATE_uma = d_State != nullptr;
+        DST_uma = d_D != nullptr;
+    }
+    // Every tensor not resolved above falls back to its buffer context's dev_buffer, at vk_tensor_offset plus view_offs.
+    if (!K_uma) {
+        d_K = k_buf_ctx->dev_buffer;
+        k_offset = vk_tensor_offset(k) + k->view_offs;
+    }
+    if (!V_uma) {
+        d_V = v_buf_ctx->dev_buffer;
+        v_offset = vk_tensor_offset(v) + v->view_offs;
+    }
+    if (!R_uma) {
+        d_R = r_buf_ctx->dev_buffer;
+        r_offset = vk_tensor_offset(r) + r->view_offs;
+    }
+    if (!TF_uma) {
+        d_TF = tf_buf_ctx->dev_buffer;
+        tf_offset = vk_tensor_offset(tf) + tf->view_offs;
+    }
+    if (!TD_uma) {
+        d_TD = td_buf_ctx->dev_buffer;
+        td_offset = vk_tensor_offset(td) + td->view_offs;
+    }
+    if (!STATE_uma) {
+        d_State = state_buf_ctx->dev_buffer;
+        state_offset = vk_tensor_offset(state) + state->view_offs;
+    }
+    if (!DST_uma) {
+        d_D = dst_buf_ctx->dev_buffer;
+        dst_offset = vk_tensor_offset(dst) + dst->view_offs;
+    }
+    // Each subbuffer range is sized with ggml_nbytes of its tensor.
+    const uint64_t k_size = ggml_nbytes(k);
+    const uint64_t v_size = ggml_nbytes(v);
+    const uint64_t r_size = ggml_nbytes(r);
+    const uint64_t tf_size = ggml_nbytes(tf);
+    const uint64_t td_size = ggml_nbytes(td);
+    const uint64_t state_size = ggml_nbytes(state);
+    const uint64_t dst_size = ggml_nbytes(dst);
+
+    std::array<uint32_t, 3> elements = { // dispatch extent: pc.B * pc.H along x, 1 along y and z
+        (uint32_t)(pc.B * pc.H),
+        1,
+        1
+    };
+    // Seven subbuffers are passed, in the order k, v, r, tf, td, state, dst.
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
+        vk_subbuffer{ d_K, k_offset, k_size },
+        vk_subbuffer{ d_V, v_offset, v_size },
+        vk_subbuffer{ d_R, r_offset, r_size },
+        vk_subbuffer{ d_TF, tf_offset, tf_size },
+        vk_subbuffer{ d_TD, td_offset, td_size },
+        vk_subbuffer{ d_State, state_offset, state_size },
+        vk_subbuffer{ d_D, dst_offset, dst_size }
+    }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
+}
+
+static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { // Builds the WKV6 push constants from tensor shapes and forwards to ggml_vk_op_f32_rwkv6.
+    const size_t seq_length = dst->src[0]->ne[3]; // dim 3 of the first source tensor
+    const size_t n_embed = dst->ne[0]; // dim 0 of the output tensor
+    const size_t n_heads = dst->src[0]->ne[2]; // dim 2 of the first source tensor
+    const size_t n_seqs = dst->src[5]->ne[1]; // dim 1 of the sixth source tensor (the one named state in ggml_vk_op_f32_rwkv6)
+    // Values are narrowed to uint32_t and passed in the order n_seqs, seq_length, n_embed, n_heads.
+    ggml_vk_op_f32_rwkv6(
+        ctx, subctx, dst,
+        {
+            (uint32_t)n_seqs, // NOTE(review): presumably fills pc.B, whose product with pc.H sizes the dispatch - confirm against the struct definition
+            (uint32_t)seq_length,
+            (uint32_t)n_embed,
+            (uint32_t)n_heads, // NOTE(review): presumably fills pc.H - confirm against the struct definition
+        },
+        dryrun
+    );
+}
+
 static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     int * op_params = (int *)dst->op_params;
 
@@ -6464,6 +6718,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_POOL_2D:
+    case GGML_OP_RWKV_WKV6:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_FLASH_ATTN_EXT:
         break;
@@ -6663,6 +6918,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_FLASH_ATTN_EXT:
         ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun);
 
+        break;
+
+    case GGML_OP_RWKV_WKV6:
+        ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
+
         break;
     default:
         return false;
@@ -6743,6 +7003,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_POOL_2D:
+    case GGML_OP_RWKV_WKV6:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
         buf = tensor->buffer;
@@ -7582,7 +7843,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_REPEAT:
             return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
         case GGML_OP_ROPE:
-            return ggml_is_contiguous(op->src[0]);
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -7610,6 +7880,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_IM2COL:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_POOL_2D:
+        case GGML_OP_RWKV_WKV6:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
@@ -8186,7 +8457,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
-    } else {
+    } else if (tensor->op == GGML_OP_RWKV_WKV6) {
+        tensor_clone = ggml_rwkv_wkv6(ggml_ctx, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3],
+        tensor->src[4], tensor->src[5]);
+    }
+    else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ABORT("fatal error");
     }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
index 5fc1ba4ad36..91bb8f8db61 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@@ -25,92 +25,94 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 
 #if defined(DATA_A_Q4_0)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
+    return (vec2(vui & 0xF, vui >> 4) - 8.0f); // low/high nibble minus 8; the block scale d is no longer applied here (get_dm() returns it)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
     const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
-    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d;
+    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f); // four nibbles minus 8, unscaled; top nibble is unmasked - assumes the packed16 qs element is 16 bits wide, TODO confirm
 }
 #endif
 
 #if defined(DATA_A_Q4_1)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
-    const float m = float(data_a[a_offset + ib].m);
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return vec2(vui & 0xF, vui >> 4) * d + m;
+    return vec2(vui & 0xF, vui >> 4); // raw low/high nibble; scale d and offset m are no longer applied here (get_dm() returns them)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
-    const float m = float(data_a_packed16[a_offset + ib].m);
     const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
-    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m;
+    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12); // four raw nibbles, unscaled; top nibble is unmasked - assumes the packed16 qs element is 16 bits wide, TODO confirm
 }
 #endif
 
 #if defined(DATA_A_Q5_0)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
     const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
     const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
+    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f); // nibble OR'ed with the 0x10 high bit from qh, minus 16; the block scale d is no longer applied here (get_dm() returns it)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
     const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
     const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
     const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
     const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
-    return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d;
+    return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f); // four 5-bit values minus 16, unscaled; top nibble is unmasked - assumes the packed16 qs element is 16 bits wide, TODO confirm
 }
 #endif
 
 #if defined(DATA_A_Q5_1)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
-    const float m = float(data_a[a_offset + ib].m);
     const uint uint_qh = data_a[a_offset + ib].qh;
     const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
+    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); // nibble OR'ed with the 0x10 high bit from qh; scale d and offset m are no longer applied here (get_dm() returns them)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
-    const float m = float(data_a_packed16[a_offset + ib].m);
     const uint uint_qh = data_a_packed16[a_offset + ib].qh;
     const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
     const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
     const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
-    return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m;
+    return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y); // four raw 5-bit values, unscaled; top nibble is unmasked - assumes the packed16 qs element is 16 bits wide, TODO confirm
 }
 #endif
 
 #if defined(DATA_A_Q8_0)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
-    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
+    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])); // two consecutive raw quants; the block scale d is no longer applied here (get_dm() returns it)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
     uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
     uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
-    return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d;
+    return vec4(int8_t(v0 & 0xFF), int8_t(v0 >> 8), int8_t(v1 & 0xFF), int8_t(v1 >> 8)); // four raw quants, unscaled; the high-byte mask was dropped - presumably the int8_t conversion keeps only the low 8 bits, TODO confirm
 }
 #endif
 
 #if defined(DATA_A_IQ4_NL)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a[a_offset + ib].d);
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
-    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
+    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]); // kvalues_iq4nl lookups for both nibbles; the block scale d is no longer applied here (get_dm() returns it)
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
-    const float d = float(data_a_packed16[a_offset + ib].d);
     const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
-    return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d;
+    return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]); // four table lookups, unscaled; top nibble is unmasked - assumes the packed16 qs element is 16 bits wide, TODO confirm
+}
+#endif
+
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+vec2 get_dm(uint ib, uint a_offset) { // F32/F16 variant: reads nothing from data_a; ib and a_offset are unused
+    return vec2(0, 0);
+}
+#endif
+
+#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
+vec2 get_dm(uint ib, uint a_offset) { // variant for the types listed in the #if above: x = d of block a_offset + ib, y = 0
+    return vec2(float(data_a[a_offset + ib].d), 0);
+}
+#endif
+
+#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
+vec2 get_dm(uint ib, uint a_offset) { // Q4_1/Q5_1 variant: x = d and y = m of block a_offset + ib
+    return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
 }
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
index a8707b621e7..94b78598ea2 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
@@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
     const float16_t d = bl.block.d;
     const uint idx = coordInBlock[1];
     const uint shift = (idx & 0x10) >> 2;
-    uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1];
+    uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
     qs >>= shift;
-    qs &= 0xF;
+    qs &= 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
     float16_t ret = (float16_t(qs) - float16_t(8)) * d;
     return ret;
 }
@@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
    block_q4_K block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
+   block_q4_K_packed16 block;
+};
+
 float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;
 
-    const uint n = iqs / 64;                   // 0,1,2,3
-    const uint b = (iqs % 64) / 32;            // 0,1
+    const uint b = (idx & 0x20) >> 5;            // 0,1
     const uint is = (idx & 0xE0) >> 5;         // 0..7
-    const uint qsi = n * 32 + (iqs % 32);      // 0..127
 
     const f16vec2 loadd = bl.block.d;
 
@@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
     const float16_t d = loadd.x * float16_t(sc);
     const float16_t m = loadd.y * float16_t(mbyte);
 
-    uint32_t dmask = 0xF << (b * 4);
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4)) & 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
 
-    float16_t ret = d * float16_t((bl.block.qs[qsi    ] & dmask) >> (b * 4)) - m;
+    float16_t ret = d * float16_t(qs) - m;
 
     return ret;
 }
@@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
    block_q5_K block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
+   block_q5_K_packed16 block;
+};
+
 float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;
 
-    const uint n = iqs / 64;                   // 0,1,2,3
-    const uint b = (iqs % 64) / 32;            // 0,1
+    const uint b = (idx & 0x20) >> 5;          // 0,1
     const uint is = (idx & 0xE0) >> 5;         // 0..7
-    const uint qsi = n * 32 + (iqs % 32);      // 0..127
-    const uint qhi = (iqs % 32);               // 0..31
 
-    const uint8_t hm = uint8_t(1 << (iqs / 32));
+    const uint32_t hm = 0x0101 << is;
 
     const f16vec2 loadd = bl.block.d;
 
@@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
     const float16_t d = loadd.x * float16_t(sc);
     const float16_t m = loadd.y * float16_t(mbyte);
 
-    uint32_t dmask = 0xF << (b * 4);
+    uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
+    qh = qh & hm;
+    qh = unpack8(qh)[idx & 1];
 
-    float16_t ret = d * (float16_t((bl.block.qs[qsi    ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi    ] & hm) != 0 ? 16 : 0)) - m;
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4)) & 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
+
+    float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
 
     return ret;
 }
@@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_
    block_q6_K block;
 };
 
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
+   block_q6_K_packed16 block;
+};
+
 float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;
 
-    const uint n = iqs / 128;                   // 0,1
-    const uint b = (iqs % 128) / 64;            // 0,1
-    const uint is_b = (iqs % 32) / 16;          // 0,1
-    const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
-    const uint is = 8 * n + qhshift + is_b;     // 0..15
-    const uint qsi = n * 64 + (iqs % 64);       // 0..127
-    const uint qhi = n * 32 + (iqs % 32);       // 0..63
+    const uint b = (idx & 0x40) >> 6;           // 0,1
+    const uint qhshift = (idx & 0x60) >> 4;    // 0,2,4,6
+    const uint is = (idx & 0xF0) >> 4;          // 0..15
 
     const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
 
-    float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi    ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi    ] >> qhshift) & 3) << 4)) - 32);
+    uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
+    ql = (ql >> (b * 4)) & 0x0F0F;
+
+    uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
+    qh = ((qh >> qhshift) & 0x0303) << 4;
+
+    int q = unpack8(ql | qh)[idx & 1];
+
+    float16_t ret = dscale * float16_t(q - 32);
 
     return ret;
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
index 92acb75406d..987f113a35a 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
@@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
-        const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        const uint ib = gl_WorkGroupID.x * 256 + wgy;
+        if (ib >= p.M * p.K / QUANT_K) {
             return;
         }
 
@@ -20,37 +20,49 @@ void main() {
         const uint is = 2 * il;
         const uint n = 4;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
 
-        const uint y_idx = i * QUANT_K + 64 * il + n * ir;
+        const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
         const uint qs_idx = 32*il + n * ir;
 
-        uint8_t sc;
-        uint8_t m;
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is] & 63);
-            m  = uint8_t(data_a[i].scales[is + 4] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
-        }
+        uint scidx0 = (is < 4) ? is : (is + 4);
+        uint scidx1 = (is < 4) ? is : (is - 4);
+        uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint scidxshift1 = (is < 4) ? 0 : 2;
+        uint mbidx0 = is + 4;
+        uint mbidx1 = (is < 4) ? is + 4 : is;
+        uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        uint mbidxshift0 = (is < 4) ? 0 : 4;
+        uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+        uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
         const FLOAT_TYPE d1 = dall * sc;
-        const FLOAT_TYPE m1 = dmin * m;
-
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is + 1] & 63);
-            m  = uint8_t(data_a[i].scales[is + 5] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
-        }
+        const FLOAT_TYPE m1 = dmin * mbyte;
+
+        scidx0 = (is < 4) ? is + 1 : (is + 5);
+        scidx1 = (is < 4) ? is + 1 : (is - 3);
+        scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        scidxshift1 = (is < 4) ? 0 : 2;
+        mbidx0 = is + 5;
+        mbidx1 = (is < 4) ? is + 5 : is + 1;
+        mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        mbidxshift0 = (is < 4) ? 0 : 4;
+        mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        mbidxshift1 = (is < 4) ? 0 : 2;
+
+        sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
         const FLOAT_TYPE d2 = dall * sc;
-        const FLOAT_TYPE m2 = dmin * m;
+        const FLOAT_TYPE m2 = dmin * mbyte;
 
         [[unroll]] for (uint l = 0; l < n; ++l) {
-            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
-            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
+            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
+            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >>  4) - m2);
         }
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
index f314a76d105..6db5403b661 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
@@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
-        const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        const uint ib = gl_WorkGroupID.x * 256 + wgy;
+        if (ib >= p.M * p.K / QUANT_K) {
             return;
         }
 
@@ -19,40 +19,52 @@ void main() {
         const uint ir = tid % 16;
         const uint is = 2 * il;
 
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
 
-        const uint y_idx = i * QUANT_K + 64 * il + 2 * ir;
+        const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
         const uint qs_idx = 32*il + 2 * ir;
         const uint qh_idx = 2 * ir;
 
-        uint8_t sc;
-        uint8_t m;
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is] & 63);
-            m  = uint8_t(data_a[i].scales[is + 4] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
-        }
+        uint scidx0 = (is < 4) ? is : (is + 4);
+        uint scidx1 = (is < 4) ? is : (is - 4);
+        uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint scidxshift1 = (is < 4) ? 0 : 2;
+        uint mbidx0 = is + 4;
+        uint mbidx1 = (is < 4) ? is + 4 : is;
+        uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        uint mbidxshift0 = (is < 4) ? 0 : 4;
+        uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+        uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
         const FLOAT_TYPE d1 = dall * sc;
-        const FLOAT_TYPE m1 = dmin * m;
-
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is + 1] & 63);
-            m  = uint8_t(data_a[i].scales[is + 5] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
-        }
+        const FLOAT_TYPE m1 = dmin * mbyte;
+
+        scidx0 = (is < 4) ? is + 1 : (is + 5);
+        scidx1 = (is < 4) ? is + 1 : (is - 3);
+        scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        scidxshift1 = (is < 4) ? 0 : 2;
+        mbidx0 = is + 5;
+        mbidx1 = (is < 4) ? is + 5 : is + 1;
+        mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        mbidxshift0 = (is < 4) ? 0 : 4;
+        mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        mbidxshift1 = (is < 4) ? 0 : 2;
+
+        sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
         const FLOAT_TYPE d2 = dall * sc;
-        const FLOAT_TYPE m2 = dmin * m;
+        const FLOAT_TYPE m2 = dmin * mbyte;
 
         const uint8_t hm1 = uint8_t(1 << (2 * il    ));
         const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
-        data_b[y_idx     ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx    ] & 0xF) + (((data_a[i].qh[qh_idx    ] & hm1) != 0) ? 16 : 0)) - m1);
-        data_b[y_idx +  1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
-        data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx    ]  >> 4) + (((data_a[i].qh[qh_idx    ] & hm2) != 0) ? 16 : 0)) - m2);
-        data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1]  >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
+        data_b[y_idx     ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx    ] & 0xF) + (((data_a[ib].qh[qh_idx    ] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx +  1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx    ]  >> 4) + (((data_a[ib].qh[qh_idx    ] & hm2) != 0) ? 16 : 0)) - m2);
+        data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1]  >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
index 7f608315b68..1426fde6597 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
@@ -31,6 +31,8 @@ void main() {
     const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
     vec2 v = dequantize(ib, iqs, 0);
+    const vec2 dm = get_dm(ib, 0);
+    v = v * dm.x + dm.y;
 
     data_d[d_offset + iybs + iqs           ] = D_TYPE(v.x);
     data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index 2d5b8e46613..187c31916d1 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -31,27 +31,13 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
 
 #if K_PER_ITER == 8
 #if QUANT_R == 2
-    B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
-    B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
-    FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x);
-    FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x);
-    FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y);
-    FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y);
-    FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z);
-    FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z);
-    FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w);
-    FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w);
+    const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
+    const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
+    const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
+    const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
 #else
-    B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4];
-    B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1];
-    FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x);
-    FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y);
-    FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z);
-    FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w);
-    FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x);
-    FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y);
-    FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z);
-    FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w);
+    const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]);
+    const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]);
 #endif
 #else
     // Check if the second of the pair of elements is OOB, and don't fetch B or
@@ -67,22 +53,29 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
         b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
     }
 #endif
+    uint ibi = first_row*p.ncols;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
+        const uint ib = (ibi + col)/QUANT_K; // block index
+        ibi += p.ncols;
 
 #if K_PER_ITER == 8
-        const vec4 v = dequantize4(ib, iqs, a_offset);
-        const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
+        vec4 v = dequantize4(ib, iqs, a_offset);
+        vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
+
+        const vec2 dm = get_dm(ib, a_offset);
+        if (dm.y != 0) { // quant has min component
+            v = v * dm.x + dm.y;
+            v2 = v2 * dm.x + dm.y;
+        }
 
         // matrix multiplication
-        temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]);
-        temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]);
+        FLOAT_TYPE rowtmp = dot(bv0, v);
+        rowtmp += dot(bv1, v2);
+
+        if (dm.y == 0)
+            rowtmp *= dm.x;
+
+        temp[n] += rowtmp;
 #else
         const vec2 v = dequantize(ib, iqs, a_offset);
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
index 1a5350d99ea..138ad018411 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -6,21 +6,15 @@
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-
-    if (row >= p.stride_d) {
-        return;
-    }
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
     const uint num_blocks_per_row = p.ncols / QUANT_K;
-    const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
     // 16 threads are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
@@ -38,15 +32,15 @@ void main() {
     const uint s_offset = 8*v_im;
     const uint y_offset = 128*v_im + l0;
 
-    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp[NUM_ROWS];
+
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
         const uint y_idx = i * QUANT_K + y_offset;
 
-        f16vec2 d = data_a[ib0 + i].d;
-        const FLOAT_TYPE dall = d.x;
-        const FLOAT_TYPE dmin = d.y;
-
         B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
         B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
         B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
@@ -56,58 +50,84 @@ void main() {
         B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
         B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
 
-        uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
-        uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
-
-        uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
-        uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
-        uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
-        uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
-
-        uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
-        uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
-        uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
-        uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
-
-        uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
-        uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
-        uvec2 qs0 =  uvec2(unpack8(qs0_u16));
-        uvec2 qs16 = uvec2(unpack8(qs16_u16));
-
-        FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
-        FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
-        [[unroll]] for (int l = 0; l < 2; ++l) {
-            sum1 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 0) & 3),
-                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
-                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 2) & 3),
-                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
-                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 4) & 3),
-                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
-                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 6) & 3),
-                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
-            sum2 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_hi4[0]),
-                   fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_hi4[1]),
-                   fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_hi4[2]),
-                   fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_hi4[3]),
-                   fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_hi4[0]),
-                   fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_hi4[1]),
-                   fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_hi4[2]),
-                   fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+            f16vec2 d = data_a[ib0 + i].d;
+            const FLOAT_TYPE dall = d.x;
+            const FLOAT_TYPE dmin = d.y;
+
+            uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
+            uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
+
+            uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
+            uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
+            uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
+            uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
+
+            uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
+            uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
+            uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
+            uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
+
+            uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
+            uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
+            uvec2 qs0 =  uvec2(unpack8(qs0_u16));
+            uvec2 qs16 = uvec2(unpack8(qs16_u16));
+
+            FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
+            FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
+            [[unroll]] for (int l = 0; l < 2; ++l) {
+                sum1 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 0) & 3),
+                       fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
+                       fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 2) & 3),
+                       fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
+                       fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l]  >> 4) & 3),
+                       fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
+                       fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l]  >> 6) & 3),
+                       fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
+                sum2 = fma(FLOAT_TYPE(b0[l]),   FLOAT_TYPE(s0_hi4[0]),
+                       fma(FLOAT_TYPE(b16[l]),  FLOAT_TYPE(s0_hi4[1]),
+                       fma(FLOAT_TYPE(b32[l]),  FLOAT_TYPE(s0_hi4[2]),
+                       fma(FLOAT_TYPE(b48[l]),  FLOAT_TYPE(s0_hi4[3]),
+                       fma(FLOAT_TYPE(b64[l]),  FLOAT_TYPE(s4_hi4[0]),
+                       fma(FLOAT_TYPE(b80[l]),  FLOAT_TYPE(s4_hi4[1]),
+                       fma(FLOAT_TYPE(b96[l]),  FLOAT_TYPE(s4_hi4[2]),
+                       fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
+            }
+            temp[n] = fma(dall, sum1, fma(-dmin, sum2, temp[n]));
         }
-        temp = fma(dall, sum1, fma(-dmin, sum2, temp));
     }
 
-    tmp[gl_LocalInvocationID.x] = temp;
-
     // sum up partial sums and write back result
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
index b19c3811136..82ec42d257d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -6,21 +6,15 @@
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-
-    if (row >= p.stride_d) {
-        return;
-    }
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
     const uint num_blocks_per_row = p.ncols / QUANT_K;
-    const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
     // 16 threads are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
@@ -35,19 +29,21 @@ void main() {
 
     const uint8_t m = uint8_t(1 << (4 * v_im));
 
-    const uint l0 = 2*v_in;                                // 0...15
+    const uint l0 = 2*v_in;                                 // 0...15
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 128*v_im + l0;
 
-    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp[NUM_ROWS];
+
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
     const uint s_shift = 4 * v_im;
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
         const uint y_idx = i * QUANT_K + y_offset;
 
-        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
-
         B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
         B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
         B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
@@ -57,44 +53,68 @@ void main() {
         B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
         B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
 
-        uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
-        uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
-        uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
-        uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
-        uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
-        uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
-        u8vec2 s0 = unpack8(s0_16);
-        u8vec2 s2 = unpack8(s2_16);
-        u8vec2 s4 = unpack8(s4_16);
-        u8vec2 s6 = unpack8(s6_16);
-        u8vec2 s8 = unpack8(s8_16);
-        u8vec2 s10 = unpack8(s10_16);
-
-        FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-        [[unroll]] for (int l = 0; l < 2; ++l) {
-            sum = fma(FLOAT_TYPE(b0[l])   * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b32[l])  * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b64[l])  * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b96[l])  * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b16[l])  * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b48[l])  * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b80[l])  * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
-                  fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+            const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
+
+            uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
+            uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
+            uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
+            uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
+            uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
+            uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
+            u8vec2 s0 = unpack8(s0_16);
+            u8vec2 s2 = unpack8(s2_16);
+            u8vec2 s4 = unpack8(s4_16);
+            u8vec2 s6 = unpack8(s6_16);
+            u8vec2 s8 = unpack8(s8_16);
+            u8vec2 s10 = unpack8(s10_16);
+
+            FLOAT_TYPE sum = FLOAT_TYPE(0.0);
+            [[unroll]] for (int l = 0; l < 2; ++l) {
+                sum = fma(FLOAT_TYPE(b0[l])   * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b32[l])  * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b64[l])  * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b96[l])  * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b16[l])  * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b48[l])  * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b80[l])  * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
+                      fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
+            }
+            temp[n] = fma(d, sum, temp[n]);
         }
-        temp = fma(d, sum, temp);
     }
 
-    tmp[gl_LocalInvocationID.x] = temp;
-
     // sum up partial sums and write back result
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
index b86d28589c6..677c207a842 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@@ -7,21 +7,15 @@
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-
-    if (row >= p.stride_d) {
-        return;
-    }
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
     const uint num_blocks_per_row = p.ncols / QUANT_K;
-    const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
     // 16 threads are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
@@ -31,8 +25,8 @@ void main() {
 
     const uint step = 4;
 
-    const uint il = itid/step;                               // 0...3
-    const uint ir = itid - step*il;                          // 0...7 or 0...3
+    const uint il = itid/step;                      // 0...3
+    const uint ir = itid - step*il;                 // 0...3 (itid % step; step is fixed at 4 here)
     const uint n =  4;
 
     const uint v_im = il / 2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
@@ -42,90 +36,116 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 64*v_im + l0;
 
-    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp[NUM_ROWS];
+
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
         const uint y1_idx = i * QUANT_K + y_offset;
         const uint y2_idx = y1_idx + 128;
 
-        f16vec2 d = data_a[ib0 + i].d;
-        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
-
-        uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
-        uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
-        uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
-        uvec4 scale0 = uvec4(unpack8(scale0_u32));
-        uvec4 scale4 = uvec4(unpack8(scale4_u32));
-        uvec4 scale8 = uvec4(unpack8(scale8_u32));
-
-        const uint32_t sc0 = (  scale0.x       & 0x3f);
-        const uint32_t sc1 = (  scale0.y       & 0x3f);
-        const uint32_t sc2 = (  scale4.x       & 0x3f);
-        const uint32_t sc3 = (  scale4.y       & 0x3f);
-        const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
-        const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
-        const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
-        const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
-
-        uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
-        uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
-
-        uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
-        uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
-        uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
-        uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
-
-        uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
-        uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
-        uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
-        uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
-
-        const uint32_t q4_0  = qs0_lo4.x;
-        const uint32_t q4_1  = qs0_lo4.y;
-        const uint32_t q4_2  = qs0_lo4.z;
-        const uint32_t q4_3  = qs0_lo4.w;
-        const uint32_t q4_4  = qs0_hi4.x;
-        const uint32_t q4_5  = qs0_hi4.y;
-        const uint32_t q4_6  = qs0_hi4.z;
-        const uint32_t q4_7  = qs0_hi4.w;
-        const uint32_t q4_8  = qs64_lo4.x;
-        const uint32_t q4_9  = qs64_lo4.y;
-        const uint32_t q4_10 = qs64_lo4.z;
-        const uint32_t q4_11 = qs64_lo4.w;
-        const uint32_t q4_12 = qs64_hi4.x;
-        const uint32_t q4_13 = qs64_hi4.y;
-        const uint32_t q4_14 = qs64_hi4.z;
-        const uint32_t q4_15 = qs64_hi4.w;
-
         B_TYPE_VEC4 by10 =  data_b_v4[(b_offset + y1_idx) / 4];
         B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
         B_TYPE_VEC4 by20 =  data_b_v4[(b_offset + y2_idx) / 4];
         B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
 
-        const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x),      q4_0,  fma(FLOAT_TYPE(by10.y),  q4_1,  fma(FLOAT_TYPE(by10.z),  q4_2,  FLOAT_TYPE(by10.w) *  q4_3)));
-        const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x),     q4_4,  fma(FLOAT_TYPE(by132.y), q4_5,  fma(FLOAT_TYPE(by132.z), q4_6,  FLOAT_TYPE(by132.w) * q4_7)));
-        const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x),      q4_8,  fma(FLOAT_TYPE(by20.y),  q4_9,  fma(FLOAT_TYPE(by20.z),  q4_10, FLOAT_TYPE(by20.w) *  q4_11)));
-        const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x),     q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
-        const FLOAT_TYPE smin =
-            fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
-            fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
-            fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
-            fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6,     FLOAT_TYPE(by232.w) * sc7)))))))))))))));
-        temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+            f16vec2 d = data_a[ib0 + i].d;
+            const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+            const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
+
+            uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+            uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+            uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
+            uvec4 scale0 = uvec4(unpack8(scale0_u32));
+            uvec4 scale4 = uvec4(unpack8(scale4_u32));
+            uvec4 scale8 = uvec4(unpack8(scale8_u32));
+
+            const uint32_t sc0 = (  scale0.x       & 0x3f);
+            const uint32_t sc1 = (  scale0.y       & 0x3f);
+            const uint32_t sc2 = (  scale4.x       & 0x3f);
+            const uint32_t sc3 = (  scale4.y       & 0x3f);
+            const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
+            const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
+            const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
+            const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
+
+            uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
+            uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
+
+            uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
+            uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
+            uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
+            uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
+
+            uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
+            uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
+            uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
+            uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
+
+            const uint32_t q4_0  = qs0_lo4.x;
+            const uint32_t q4_1  = qs0_lo4.y;
+            const uint32_t q4_2  = qs0_lo4.z;
+            const uint32_t q4_3  = qs0_lo4.w;
+            const uint32_t q4_4  = qs0_hi4.x;
+            const uint32_t q4_5  = qs0_hi4.y;
+            const uint32_t q4_6  = qs0_hi4.z;
+            const uint32_t q4_7  = qs0_hi4.w;
+            const uint32_t q4_8  = qs64_lo4.x;
+            const uint32_t q4_9  = qs64_lo4.y;
+            const uint32_t q4_10 = qs64_lo4.z;
+            const uint32_t q4_11 = qs64_lo4.w;
+            const uint32_t q4_12 = qs64_hi4.x;
+            const uint32_t q4_13 = qs64_hi4.y;
+            const uint32_t q4_14 = qs64_hi4.z;
+            const uint32_t q4_15 = qs64_hi4.w;
+
+            const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x),      q4_0,  fma(FLOAT_TYPE(by10.y),  q4_1,  fma(FLOAT_TYPE(by10.z),  q4_2,  FLOAT_TYPE(by10.w) *  q4_3)));
+            const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x),     q4_4,  fma(FLOAT_TYPE(by132.y), q4_5,  fma(FLOAT_TYPE(by132.z), q4_6,  FLOAT_TYPE(by132.w) * q4_7)));
+            const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x),      q4_8,  fma(FLOAT_TYPE(by20.y),  q4_9,  fma(FLOAT_TYPE(by20.z),  q4_10, FLOAT_TYPE(by20.w) *  q4_11)));
+            const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x),     q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
+            const FLOAT_TYPE smin =
+                fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
+                fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
+                fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
+                fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6,     FLOAT_TYPE(by232.w) * sc7)))))))))))))));
+            temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
+        }
     }
 
-    tmp[gl_LocalInvocationID.x] = temp;
-
     // sum up partial sums and write back result
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
index fd243cf9161..ed3c25d891c 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@@ -7,21 +7,15 @@
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-
-    if (row >= p.stride_d) {
-        return;
-    }
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
     const uint num_blocks_per_row = p.ncols / QUANT_K;
-    const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
     // 16 threads are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
@@ -39,74 +33,16 @@ void main() {
     const uint q_offset = 32*v_im + l0;
     const uint y_offset = 64*v_im + l0;
 
-    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
+    FLOAT_TYPE temp[NUM_ROWS];
+
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
     [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
         const uint y1_idx = i * QUANT_K + y_offset;
         const uint y2_idx = y1_idx + 128;
 
-        f16vec2 d = data_a[ib0 + i].d;
-        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
-
-        uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
-        uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
-        uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
-        uvec4 scale0 = uvec4(unpack8(scale0_u32));
-        uvec4 scale4 = uvec4(unpack8(scale4_u32));
-        uvec4 scale8 = uvec4(unpack8(scale8_u32));
-
-        const uint32_t sc0 = (  scale0.x       & 0x3f);
-        const uint32_t sc1 = (  scale0.y       & 0x3f);
-        const uint32_t sc2 = (  scale4.x       & 0x3f);
-        const uint32_t sc3 = (  scale4.y       & 0x3f);
-        const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
-        const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
-        const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
-        const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
-
-        uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
-        uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
-
-        uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
-        uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
-        uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
-        uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
-
-        uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
-
-        uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
-        uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
-        uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
-        uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
-
-        qs0_16_u32_lo4 += qs0_16_lo4_offset16;
-        qs0_16_u32_hi4 += qs0_16_hi4_offset16;
-        qs64_80_u32_lo4 += qs64_80_lo4_offset16;
-        qs64_80_u32_hi4 += qs64_80_hi4_offset16;
-
-        uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
-        uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
-        uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
-        uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
-
-        const uint32_t q4_0  = qs0_16_lo4.x;
-        const uint32_t q4_1  = qs0_16_lo4.y;
-        const uint32_t q4_2  = qs0_16_lo4.z;
-        const uint32_t q4_3  = qs0_16_lo4.w;
-        const uint32_t q4_4  = qs0_16_hi4.x;
-        const uint32_t q4_5  = qs0_16_hi4.y;
-        const uint32_t q4_6  = qs0_16_hi4.z;
-        const uint32_t q4_7  = qs0_16_hi4.w;
-        const uint32_t q4_8  = qs64_80_lo4.x;
-        const uint32_t q4_9  = qs64_80_lo4.y;
-        const uint32_t q4_10 = qs64_80_lo4.z;
-        const uint32_t q4_11 = qs64_80_lo4.w;
-        const uint32_t q4_12 = qs64_80_hi4.x;
-        const uint32_t q4_13 = qs64_80_hi4.y;
-        const uint32_t q4_14 = qs64_80_hi4.z;
-        const uint32_t q4_15 = qs64_80_hi4.w;
-
         B_TYPE_VEC2 by10 =  data_b_v2[(b_offset + y1_idx) / 2];
         B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
         B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
@@ -116,45 +52,129 @@ void main() {
         B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
         B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
 
-        const FLOAT_TYPE sx =
-          fma(FLOAT_TYPE(by10.x), q4_0,
-          fma(FLOAT_TYPE(by10.y), q4_1,
-          fma(FLOAT_TYPE(by116.x), q4_2,
-             FLOAT_TYPE(by116.y) * q4_3)));
-        const FLOAT_TYPE sy =
-          fma(FLOAT_TYPE(by132.x), q4_4,
-          fma(FLOAT_TYPE(by132.y), q4_5,
-          fma(FLOAT_TYPE(by148.x), q4_6,
-             FLOAT_TYPE(by148.y) * q4_7)));
-        const FLOAT_TYPE sz =
-          fma(FLOAT_TYPE(by20.x), q4_8,
-          fma(FLOAT_TYPE(by20.y), q4_9,
-          fma(FLOAT_TYPE(by216.x), q4_10,
-             FLOAT_TYPE(by216.y) * q4_11)));
-        const FLOAT_TYPE sw =
-          fma(FLOAT_TYPE(by232.x), q4_12,
-          fma(FLOAT_TYPE(by232.y), q4_13,
-          fma(FLOAT_TYPE(by248.x), q4_14,
-             FLOAT_TYPE(by248.y) * q4_15)));
-        const FLOAT_TYPE smin =
-          fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
-          fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
-          fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
-              (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
-        temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+            f16vec2 d = data_a[ib0 + i].d;
+            const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+            const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
+
+            uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+            uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+            uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
+            uvec4 scale0 = uvec4(unpack8(scale0_u32));
+            uvec4 scale4 = uvec4(unpack8(scale4_u32));
+            uvec4 scale8 = uvec4(unpack8(scale8_u32));
+
+            const uint32_t sc0 = (  scale0.x       & 0x3f);
+            const uint32_t sc1 = (  scale0.y       & 0x3f);
+            const uint32_t sc2 = (  scale4.x       & 0x3f);
+            const uint32_t sc3 = (  scale4.y       & 0x3f);
+            const uint32_t sc4 = (( scale8.x       & 0x0f) | ((scale0.x & 0xc0) >> 2));
+            const uint32_t sc5 = (( scale8.y       & 0x0f) | ((scale0.y & 0xc0) >> 2));
+            const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
+            const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
+
+            uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
+            uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
+
+            uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
+            uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
+            uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
+            uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
+
+            uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
+
+            uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
+            uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
+            uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
+            uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
+
+            qs0_16_u32_lo4 += qs0_16_lo4_offset16;
+            qs0_16_u32_hi4 += qs0_16_hi4_offset16;
+            qs64_80_u32_lo4 += qs64_80_lo4_offset16;
+            qs64_80_u32_hi4 += qs64_80_hi4_offset16;
+
+            uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
+            uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
+            uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
+            uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
+
+            const uint32_t q4_0  = qs0_16_lo4.x;
+            const uint32_t q4_1  = qs0_16_lo4.y;
+            const uint32_t q4_2  = qs0_16_lo4.z;
+            const uint32_t q4_3  = qs0_16_lo4.w;
+            const uint32_t q4_4  = qs0_16_hi4.x;
+            const uint32_t q4_5  = qs0_16_hi4.y;
+            const uint32_t q4_6  = qs0_16_hi4.z;
+            const uint32_t q4_7  = qs0_16_hi4.w;
+            const uint32_t q4_8  = qs64_80_lo4.x;
+            const uint32_t q4_9  = qs64_80_lo4.y;
+            const uint32_t q4_10 = qs64_80_lo4.z;
+            const uint32_t q4_11 = qs64_80_lo4.w;
+            const uint32_t q4_12 = qs64_80_hi4.x;
+            const uint32_t q4_13 = qs64_80_hi4.y;
+            const uint32_t q4_14 = qs64_80_hi4.z;
+            const uint32_t q4_15 = qs64_80_hi4.w;
+
+            const FLOAT_TYPE sx =
+              fma(FLOAT_TYPE(by10.x), q4_0,
+              fma(FLOAT_TYPE(by10.y), q4_1,
+              fma(FLOAT_TYPE(by116.x), q4_2,
+                 FLOAT_TYPE(by116.y) * q4_3)));
+            const FLOAT_TYPE sy =
+              fma(FLOAT_TYPE(by132.x), q4_4,
+              fma(FLOAT_TYPE(by132.y), q4_5,
+              fma(FLOAT_TYPE(by148.x), q4_6,
+                 FLOAT_TYPE(by148.y) * q4_7)));
+            const FLOAT_TYPE sz =
+              fma(FLOAT_TYPE(by20.x), q4_8,
+              fma(FLOAT_TYPE(by20.y), q4_9,
+              fma(FLOAT_TYPE(by216.x), q4_10,
+                 FLOAT_TYPE(by216.y) * q4_11)));
+            const FLOAT_TYPE sw =
+              fma(FLOAT_TYPE(by232.x), q4_12,
+              fma(FLOAT_TYPE(by232.y), q4_13,
+              fma(FLOAT_TYPE(by248.x), q4_14,
+                 FLOAT_TYPE(by248.y) * q4_15)));
+            const FLOAT_TYPE smin =
+              fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
+              fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
+              fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
+                  (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
+            temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
+        }
     }
 
-    tmp[gl_LocalInvocationID.x] = temp;
-
     // sum up partial sums and write back result
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
index 760aff85499..fab4ff5ff05 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -7,21 +7,15 @@
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
 
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
-
-    if (row >= p.stride_d) {
-        return;
-    }
+shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
     const uint num_blocks_per_row = p.ncols / QUANT_K;
-    const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
 
     // 16 threads are used to process each block
     const uint it_size = gl_WorkGroupSize.x/16;
@@ -42,69 +36,95 @@ void main() {
     const uint s_offset  =  8*v_im + is;
     const uint y_offset = 128*v_im + l0;
 
-    FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
-        const uint y_idx   = i * QUANT_K + y_offset;
-
-        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
-
-        FLOAT_TYPE scales[4];
-        scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
-        scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
-        scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
-        scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
-
-        uint32_t ql0_u32 =  uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
-        uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
-
-        uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
-        uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
-        uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
-        uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
-
-        uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
-        uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
-        uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
-        uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
-        uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
+    FLOAT_TYPE temp[NUM_ROWS];
 
-        uint32_t q0_u32 = ql0_u32_lo4  | qh0_u32;
-        uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
-        uint32_t q2_u32 = ql0_u32_hi4  | qh4_u32;
-        uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+        temp[i] = FLOAT_TYPE(0);
+    }
 
-        uvec4 q0 = uvec4(unpack8(q0_u32));
-        uvec4 q1 = uvec4(unpack8(q1_u32));
-        uvec4 q2 = uvec4(unpack8(q2_u32));
-        uvec4 q3 = uvec4(unpack8(q3_u32));
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
+        const uint y_idx = i * QUANT_K + y_offset;
 
         B_TYPE_VEC4 by0  = data_b_v4[(b_offset + y_idx) / 4];
         B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
         B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
         B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
 
-        FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-        [[unroll]] for (int l = 0; l < 4; ++l) {
-            sum = fma(FLOAT_TYPE(by0[l])  * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
-                  fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
-                  fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
-                  fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+            const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
+
+            FLOAT_TYPE scales[4];
+            scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
+            scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
+            scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
+            scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
+
+            uint32_t ql0_u32 =  uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
+            uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+
+            uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
+            uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
+            uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
+            uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
+
+            uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
+            uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
+            uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
+            uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
+            uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
+
+            uint32_t q0_u32 = ql0_u32_lo4  | qh0_u32;
+            uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
+            uint32_t q2_u32 = ql0_u32_hi4  | qh4_u32;
+            uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
+
+            uvec4 q0 = uvec4(unpack8(q0_u32));
+            uvec4 q1 = uvec4(unpack8(q1_u32));
+            uvec4 q2 = uvec4(unpack8(q2_u32));
+            uvec4 q3 = uvec4(unpack8(q3_u32));
+
+            FLOAT_TYPE sum = FLOAT_TYPE(0.0);
+            [[unroll]] for (int l = 0; l < 4; ++l) {
+                sum = fma(FLOAT_TYPE(by0[l])  * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
+                      fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
+                      fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
+                      fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
+            }
+            temp[n] += sum * d;
         }
-        temp += sum * d;
     }
 
-    tmp[gl_LocalInvocationID.x] = temp;
     // sum up partial sums and write back result
-
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        tmpsh[n][tid] = temp[n];
+    }
     barrier();
-    [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
         if (tid < s) {
-            tmp[tid] += tmp[tid + s];
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                tmpsh[n][tid] += tmpsh[n][tid + s];
+            }
         }
         barrier();
     }
     if (tid == 0) {
-        data_d[d_offset + row] = D_TYPE(tmp[0]);
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
+        }
+    }
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
index 6e20b6411ca..a25808e1656 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
@@ -32,7 +32,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
 void soft_max(uint num_iters) {
     const uint tid = gl_LocalInvocationID.x;
     const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
-    const uint rowy = rowx % p.KY;
+    const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0;
 
     if (rowx >= p.nrows_x) {
         return;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index c48a228aef6..8111c063884 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
     }
 
     PROCESS_INFORMATION pi;
-    STARTUPINFOA si = { sizeof(STARTUPINFOA) };
+    STARTUPINFOA si = {};
+    si.cb = sizeof(STARTUPINFOA);
     si.dwFlags = STARTF_USESTDHANDLES;
     si.hStdOutput = stdout_write;
     si.hStdError = stderr_write;
@@ -479,6 +480,8 @@ void process_shaders() {
 
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
+    string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     for (auto &c : compiles) {
         c.wait();
     }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
new file mode 100644
index 00000000000..35cc6c45f90
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
@@ -0,0 +1,87 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#define BLOCK_SIZE 64
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(push_constant) uniform Parameters {
+    uint B;
+    uint T;
+    uint C;
+    uint H;
+};
+
+layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; };
+layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; };
+layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; };
+layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; };
+layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; };
+layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; };
+layout(binding = 6) buffer DstBuf { A_TYPE dst[]; };
+
+shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE];
+
+void main() { // one workgroup per (batch, head) pair; invocation tid owns element tid of that head
+    const uint head_size = BLOCK_SIZE;
+    const uint batch_id = gl_WorkGroupID.x / H;
+    const uint head_id = gl_WorkGroupID.x % H;
+    const uint tid = gl_LocalInvocationID.x;
+
+    const uint state_size = C * head_size;
+    const uint n_seq_tokens = T / B; // tokens per sequence: T is split evenly across the B sequences
+
+    if (batch_id >= B || head_id >= H) { // workgroup index outside the B * H range
+        return;
+    }
+
+    A_TYPE state[BLOCK_SIZE]; // this invocation's slice of the head state: elements (i, tid) for every i
+    [[unroll]] for (uint i = 0; i < head_size; i++) {
+        state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
+                          + i * head_size + tid];
+    }
+
+    barrier();
+    _tf[tid] = tf[head_id * head_size + tid]; // tf depends only on the head, so it is staged once, outside the token loop
+    barrier();
+
+    const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
+    const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
+
+    for (uint t = start_t; t < end_t; t += C) { // one iteration per token; consecutive tokens are C elements apart
+        barrier(); // wait until every invocation is done with the previous token's shared k/r/td
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        barrier(); // wait until every invocation has staged its k/r/td element for this token
+
+        const A_TYPE v_val = v[t];
+        A_TYPE y = 0.0;
+
+        [[unroll]] for (uint j = 0; j < head_size; j += 4) { // walk the head four elements at a time as vec4
+            vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+            vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+            vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+            vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+            vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
+
+            vec4 kv = k_vec * v_val;
+
+            vec4 temp = tf_vec * kv + s_vec;
+            y += dot(r_vec, temp); // output accumulates r . (tf * k * v + state), using the state before its update
+
+            s_vec = s_vec * td_vec + kv; // state update: state = state * td + k * v
+            state[j] = s_vec.x;
+            state[j+1] = s_vec.y;
+            state[j+2] = s_vec.z;
+            state[j+3] = s_vec.w;
+        }
+
+        dst[t] = y;
+    }
+
+    [[unroll]] for (uint i = 0; i < head_size; i++) { // final state is stored in dst after the first T * C output values
+        dst[T * C + batch_id * state_size + head_id * head_size * head_size
+            + i * head_size + tid] = state[i];
+    }
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 39ecfdc8b71..08ed240706b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3546,15 +3546,18 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
+    int sections[4] = {0, 0, 0, 0};
+
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op     = GGML_OP_ROPE;
@@ -3576,6 +3579,53 @@ struct ggml_tensor * ggml_rope(
     );
 }
 
+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a, // input tensor; the result has the same type and shape
+        struct ggml_tensor  * b, // I32 vector of position ids, 4 per a->ne[2] entry
+        struct ggml_tensor  * c, // optional (may be NULL); F32 with ne[0] >= n_dims / 2 - presumably frequency factors, confirm with callers
+        int                   n_dims,
+        int                   sections[4], // four ints stored in op-param slots 11..14
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding: builds a GGML_OP_ROPE node that additionally carries `sections`
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a); // always a new tensor; there is no in-place path here
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; // slots 0..10 use the same layout as ggml_rope_impl
+    memcpy(params +  5, &freq_base,    sizeof(float)); // floats are stored bitwise in the int32 slots
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections,      sizeof(int)*4); // fills slots 11..14 (relies on int being 32 bits wide)
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -3739,13 +3789,84 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
 
-// ggml_conv_1d
-
 static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
 
-GGML_API struct ggml_tensor * ggml_conv_1d(
+// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+// a: [OC, IC, KH, KW]
+// b: [N, IC, IH, IW]
+// result: [N, OH, OW, IC*KH*KW]
+struct ggml_tensor * ggml_im2col(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a, // kernel tensor; its ne[] sizes the output, stored as src[0]
+        struct ggml_tensor  * b, // data tensor, stored as src[1]
+        int                   s0, // s/p/d are forwarded to ggml_calc_conv_output_size as its s, p, d arguments
+        int                   s1, // the *1 variants are only used for the output size when is_2D is true
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D,
+        enum ggml_type        dst_type) { // element type of the result tensor
+    if (is_2D) {
+        GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+        GGML_ASSERT(b->ne[1] == a->ne[1]);
+        GGML_ASSERT(b->ne[3] == 1);
+    }
+
+    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; // 0 in the 1D case, where it is unused
+    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+
+    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
+    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+
+    const int64_t ne[4] = { // result shape in ggml order (ne[0] first)
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
+        OW,
+        is_2D ? OH : b->ne[2],
+        is_2D ?      b->ne[3] : 1,
+    };
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; // op params: strides, paddings, dilations, 2D flag
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_IM2COL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_im2col_back( // builds a GGML_OP_IM2COL_BACK node with an F32 result of caller-supplied shape
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a, // stored as src[0]
+        struct ggml_tensor  * b, // stored as src[1]
+        int64_t             * ne, // shape of the result; passed to ggml_new_tensor as 4 dimensions
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); // no shape checks are done here; ne is trusted
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; // same op-param layout as ggml_im2col
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_IM2COL_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_conv_1d
+
+struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -3775,137 +3896,75 @@ struct ggml_tensor* ggml_conv_1d_ph(
     return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
 
-// ggml_conv_transpose_1d
-
-static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
-}
+// ggml_conv_1d_dw
 
-GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+struct ggml_tensor * ggml_conv_1d_dw(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
         int                   p0,
         int                   d0) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
 
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(d0 == 1);
-
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
-        a->ne[1], b->ne[2], 1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
 
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
+    struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
 
-    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
-    result->src[0] = a;
-    result->src[1] = b;
+    result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
 
     return result;
 }
 
-// ggml_conv_depthwise
+// ggml_conv_1d_dw_ph
 
-struct ggml_tensor * ggml_conv_depthwise_2d(
+struct ggml_tensor * ggml_conv_1d_dw_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
-        int                   s1,
-        int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1) {
-    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
-    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
-                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
-    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+        int                   d0) {
+    return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
+}
 
-    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+// ggml_conv_transpose_1d
 
-    return result;
+static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
 }
-// ggml_conv_2d
 
-// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OH, OW, IC*KH*KW]
-struct ggml_tensor * ggml_im2col(
+GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
-        int                   s1,
         int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1,
-        bool                  is_2D,
-        enum ggml_type        dst_type) {
-    if(is_2D) {
-        GGML_ASSERT(a->ne[2] == b->ne[2]);
-    } else {
-        GGML_ASSERT(a->ne[1] == b->ne[1]);
-        GGML_ASSERT(b->ne[3] == 1);
-    }
-
-    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
-    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+        int                   d0) {
+    GGML_ASSERT(ggml_is_matrix(b));
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(a->ne[3] == 1);
 
-    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
-    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+    GGML_ASSERT(p0 == 0);
+    GGML_ASSERT(d0 == 1);
 
     const int64_t ne[4] = {
-        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
-        OW,
-        is_2D ? OH : b->ne[2],
-        is_2D ?      b->ne[3] : 1,
+        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
+        a->ne[1], b->ne[2], 1,
     };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    int32_t params[] = { s0, p0, d0 };
     ggml_set_op_params(result, params, sizeof(params));
 
-    result->op     = GGML_OP_IM2COL;
+    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
     result->src[0] = a;
     result->src[1] = b;
 
     return result;
 }
 
-struct ggml_tensor * ggml_im2col_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int64_t             * ne,
-        int                   s0,
-        int                   s1,
-        int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1,
-        bool                  is_2D) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op     = GGML_OP_IM2COL_BACK;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
+// ggml_conv_2d
 
 // a: [OC，IC, KH, KW]
 // b: [N, IC, IH, IW]
@@ -3952,6 +4011,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph(
     return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
 }
 
+// ggml_conv_2d_dw
+
+struct ggml_tensor * ggml_conv_2d_dw( // composed from reshape + im2col (F16) + mul_mat; no dedicated op is created
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); // fold a's ne[2] and ne[3] into ne[3]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC, 1, KH, KW] => [1, OC, 1, KH * KW]
+    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -6016,12 +6100,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
 
 struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
 }
 
 struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
 }
 
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
@@ -6468,7 +6552,7 @@ struct gguf_context {
     void * data;
 };
 
-static size_t gguf_type_size(enum gguf_type type) {
+size_t gguf_type_size(enum gguf_type type) {
     GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
     return GGUF_TYPE_SIZE[type];
 }
@@ -6596,13 +6680,7 @@ struct gguf_context * gguf_init_empty(void) {
     return ctx;
 }
 
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = ggml_fopen(fname, "rb");
-    if (!file) {
-        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
-        return NULL;
-    }
-
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
     // offset from start of file
     size_t offset = 0;
 
@@ -6615,7 +6693,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         for (uint32_t i = 0; i < sizeof(magic); i++) {
             if (magic[i] != GGUF_MAGIC[i]) {
                 fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
                 return NULL;
             }
         }
@@ -6626,7 +6703,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
     if (!ctx) {
         fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
-        fclose(file);
         return NULL;
     }
 
@@ -6644,7 +6720,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (ctx->header.version == 1) {
             fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6657,7 +6732,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6667,12 +6741,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         const uint64_t n_kv = ctx->header.n_kv;
 
-        ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
-        if (!ctx->kv) {
-            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
+        if (n_kv > 0) {
+            ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
+            if (!ctx->kv) {
+                fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
+                gguf_free(ctx);
+                return NULL;
+            }
         }
 
         for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6719,7 +6794,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6727,7 +6801,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6739,7 +6812,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6747,7 +6819,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6778,7 +6849,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6789,7 +6859,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
         if (!ctx->infos) {
             fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6825,7 +6894,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6868,7 +6936,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 // this tensor type support have been removed:
                 fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6876,7 +6943,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (ne % ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6908,7 +6974,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         *params.ctx = ggml_init(pdata);
         if (*params.ctx == NULL) {
             fprintf(stderr, "%s: failed to initialize context\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6927,7 +6992,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
                 ggml_free(ctx_data);
                 gguf_free(ctx);
                 return NULL;
@@ -6966,7 +7030,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
             ggml_free(ctx_data);
             gguf_free(ctx);
             return NULL;
@@ -6975,11 +7038,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, params.no_alloc);
     }
 
-    fclose(file);
-
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { // opens fname and delegates parsing to gguf_init_from_file_impl
+    FILE * file = ggml_fopen(fname, "rb");
+    if (!file) {
+        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
+        return NULL; // nothing was opened, so there is nothing to close
+    }
+
+    struct gguf_context * result = gguf_init_from_file_impl(file, params); // result is passed through unchanged
+    fclose(file); // the stream is closed here whether or not parsing succeeded
+    return result;
+}
+
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == NULL) {
         return;
@@ -7439,13 +7512,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
 //    fwrite(val, sizeof(char), size, file);
 //}
 
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-
-static struct gguf_buf gguf_buf_init(size_t size) {
+struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
         /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size   =*/ size,
@@ -7455,7 +7522,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
     return buf;
 }
 
-static void gguf_buf_free(struct gguf_buf buf) {
+void gguf_buf_free(struct gguf_buf buf) {
     if (buf.data) {
         GGML_FREE(buf.data);
     }
@@ -7493,7 +7560,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
 
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
     gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4c8710b39e8..273370370e6 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -90,6 +90,7 @@ class LLM:
         VOCAB_SIZE                        = "{arch}.vocab_size"
         CONTEXT_LENGTH                    = "{arch}.context_length"
         EMBEDDING_LENGTH                  = "{arch}.embedding_length"
+        FEATURES_LENGTH                   = "{arch}.features_length"
         BLOCK_COUNT                       = "{arch}.block_count"
         LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
         FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
@@ -122,6 +123,8 @@ class Attention:
         VALUE_LENGTH      = "{arch}.attention.value_length"
         LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
         LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+        GROUPNORM_EPS     = "{arch}.attention.group_norm_epsilon"
+        GROUPNORM_GROUPS  = "{arch}.attention.group_norm_groups"
         CAUSAL            = "{arch}.attention.causal"
         Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
@@ -131,6 +134,7 @@ class Attention:
 
     class Rope:
         DIMENSION_COUNT         = "{arch}.rope.dimension_count"
+        DIMENSION_SECTIONS      = "{arch}.rope.dimension_sections"
         FREQ_BASE               = "{arch}.rope.freq_base"
         SCALING_TYPE            = "{arch}.rope.scaling.type"
         SCALING_FACTOR          = "{arch}.rope.scaling.factor"
@@ -154,6 +158,14 @@ class SSM:
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"
 
+    class PosNet:
+        EMBEDDING_LENGTH = "{arch}.posnet.embedding_length"
+        BLOCK_COUNT      = "{arch}.posnet.block_count"
+
+    class ConvNext:
+        EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
+        BLOCK_COUNT      = "{arch}.convnext.block_count"
+
     class Tokenizer:
         MODEL                = "tokenizer.ggml.model"
         PRE                  = "tokenizer.ggml.pre"
@@ -208,56 +220,60 @@ class GGUFType:
 
 
 class MODEL_ARCH(IntEnum):
-    LLAMA        = auto()
-    FALCON       = auto()
-    BAICHUAN     = auto()
-    GROK         = auto()
-    GPT2         = auto()
-    GPTJ         = auto()
-    GPTNEOX      = auto()
-    MPT          = auto()
-    STARCODER    = auto()
-    REFACT       = auto()
-    BERT         = auto()
-    NOMIC_BERT   = auto()
-    JINA_BERT_V2 = auto()
-    BLOOM        = auto()
-    STABLELM     = auto()
-    QWEN         = auto()
-    QWEN2        = auto()
-    QWEN2MOE     = auto()
-    PHI2         = auto()
-    PHI3         = auto()
-    PLAMO        = auto()
-    CODESHELL    = auto()
-    ORION        = auto()
-    INTERNLM2    = auto()
-    MINICPM      = auto()
-    MINICPM3     = auto()
-    GEMMA        = auto()
-    GEMMA2       = auto()
-    STARCODER2   = auto()
-    RWKV6        = auto()
-    MAMBA        = auto()
-    XVERSE       = auto()
-    COMMAND_R    = auto()
-    DBRX         = auto()
-    OLMO         = auto()
-    OLMO2        = auto()
-    OLMOE        = auto()
-    OPENELM      = auto()
-    ARCTIC       = auto()
-    DEEPSEEK2    = auto()
-    CHATGLM      = auto()
-    BITNET       = auto()
-    T5           = auto()
-    T5ENCODER    = auto()
-    JAIS         = auto()
-    NEMOTRON     = auto()
-    EXAONE       = auto()
-    GRANITE      = auto()
-    GRANITE_MOE  = auto()
-    CHAMELEON    = auto()
+    LLAMA            = auto()
+    DECI             = auto()
+    FALCON           = auto()
+    BAICHUAN         = auto()
+    GROK             = auto()
+    GPT2             = auto()
+    GPTJ             = auto()
+    GPTNEOX          = auto()
+    MPT              = auto()
+    STARCODER        = auto()
+    REFACT           = auto()
+    BERT             = auto()
+    NOMIC_BERT       = auto()
+    JINA_BERT_V2     = auto()
+    BLOOM            = auto()
+    STABLELM         = auto()
+    QWEN             = auto()
+    QWEN2            = auto()
+    QWEN2MOE         = auto()
+    QWEN2VL          = auto()
+    PHI2             = auto()
+    PHI3             = auto()
+    PLAMO            = auto()
+    CODESHELL        = auto()
+    ORION            = auto()
+    INTERNLM2        = auto()
+    MINICPM          = auto()
+    MINICPM3         = auto()
+    GEMMA            = auto()
+    GEMMA2           = auto()
+    STARCODER2       = auto()
+    RWKV6            = auto()
+    MAMBA            = auto()
+    XVERSE           = auto()
+    COMMAND_R        = auto()
+    DBRX             = auto()
+    OLMO             = auto()
+    OLMO2            = auto()
+    OLMOE            = auto()
+    OPENELM          = auto()
+    ARCTIC           = auto()
+    DEEPSEEK         = auto()
+    DEEPSEEK2        = auto()
+    CHATGLM          = auto()
+    BITNET           = auto()
+    T5               = auto()
+    T5ENCODER        = auto()
+    JAIS             = auto()
+    NEMOTRON         = auto()
+    EXAONE           = auto()
+    GRANITE          = auto()
+    GRANITE_MOE      = auto()
+    CHAMELEON        = auto()
+    WAVTOKENIZER_DEC = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -367,59 +383,79 @@ class MODEL_TENSOR(IntEnum):
     ENC_OUTPUT_NORM      = auto()
     CLS                  = auto() # classifier
     CLS_OUT              = auto() # classifier output projection
+    CONV1D               = auto()
+    CONVNEXT_DW          = auto()
+    CONVNEXT_NORM        = auto()
+    CONVNEXT_PW1         = auto()
+    CONVNEXT_PW2         = auto()
+    CONVNEXT_GAMMA       = auto()
+    POSNET_CONV1         = auto()
+    POSNET_CONV2         = auto()
+    POSNET_NORM          = auto()
+    POSNET_NORM1         = auto()
+    POSNET_NORM2         = auto()
+    POSNET_ATTN_NORM     = auto()
+    POSNET_ATTN_Q        = auto()
+    POSNET_ATTN_K        = auto()
+    POSNET_ATTN_V        = auto()
+    POSNET_ATTN_OUT      = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.LLAMA:          "llama",
-    MODEL_ARCH.FALCON:         "falcon",
-    MODEL_ARCH.BAICHUAN:       "baichuan",
-    MODEL_ARCH.GROK:           "grok",
-    MODEL_ARCH.GPT2:           "gpt2",
-    MODEL_ARCH.GPTJ:           "gptj",
-    MODEL_ARCH.GPTNEOX:        "gptneox",
-    MODEL_ARCH.MPT:            "mpt",
-    MODEL_ARCH.STARCODER:      "starcoder",
-    MODEL_ARCH.REFACT:         "refact",
-    MODEL_ARCH.BERT:           "bert",
-    MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
-    MODEL_ARCH.JINA_BERT_V2:   "jina-bert-v2",
-    MODEL_ARCH.BLOOM:          "bloom",
-    MODEL_ARCH.STABLELM:       "stablelm",
-    MODEL_ARCH.QWEN:           "qwen",
-    MODEL_ARCH.QWEN2:          "qwen2",
-    MODEL_ARCH.QWEN2MOE:       "qwen2moe",
-    MODEL_ARCH.PHI2:           "phi2",
-    MODEL_ARCH.PHI3:           "phi3",
-    MODEL_ARCH.PLAMO:          "plamo",
-    MODEL_ARCH.CODESHELL:      "codeshell",
-    MODEL_ARCH.ORION:          "orion",
-    MODEL_ARCH.INTERNLM2:      "internlm2",
-    MODEL_ARCH.MINICPM:        "minicpm",
-    MODEL_ARCH.MINICPM3:       "minicpm3",
-    MODEL_ARCH.GEMMA:          "gemma",
-    MODEL_ARCH.GEMMA2:         "gemma2",
-    MODEL_ARCH.STARCODER2:     "starcoder2",
-    MODEL_ARCH.RWKV6:          "rwkv6",
-    MODEL_ARCH.MAMBA:          "mamba",
-    MODEL_ARCH.XVERSE:         "xverse",
-    MODEL_ARCH.COMMAND_R:      "command-r",
-    MODEL_ARCH.DBRX:           "dbrx",
-    MODEL_ARCH.OLMO:           "olmo",
-    MODEL_ARCH.OLMO2:          "olmo2",
-    MODEL_ARCH.OLMOE:          "olmoe",
-    MODEL_ARCH.OPENELM:        "openelm",
-    MODEL_ARCH.ARCTIC:         "arctic",
-    MODEL_ARCH.DEEPSEEK2:      "deepseek2",
-    MODEL_ARCH.CHATGLM:        "chatglm",
-    MODEL_ARCH.BITNET:         "bitnet",
-    MODEL_ARCH.T5:             "t5",
-    MODEL_ARCH.T5ENCODER:      "t5encoder",
-    MODEL_ARCH.JAIS:           "jais",
-    MODEL_ARCH.NEMOTRON:       "nemotron",
-    MODEL_ARCH.EXAONE:         "exaone",
-    MODEL_ARCH.GRANITE:        "granite",
-    MODEL_ARCH.GRANITE_MOE:    "granitemoe",
-    MODEL_ARCH.CHAMELEON:      "chameleon",
+    MODEL_ARCH.LLAMA:            "llama",
+    MODEL_ARCH.DECI:             "deci",
+    MODEL_ARCH.FALCON:           "falcon",
+    MODEL_ARCH.BAICHUAN:         "baichuan",
+    MODEL_ARCH.GROK:             "grok",
+    MODEL_ARCH.GPT2:             "gpt2",
+    MODEL_ARCH.GPTJ:             "gptj",
+    MODEL_ARCH.GPTNEOX:          "gptneox",
+    MODEL_ARCH.MPT:              "mpt",
+    MODEL_ARCH.STARCODER:        "starcoder",
+    MODEL_ARCH.REFACT:           "refact",
+    MODEL_ARCH.BERT:             "bert",
+    MODEL_ARCH.NOMIC_BERT:       "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2:     "jina-bert-v2",
+    MODEL_ARCH.BLOOM:            "bloom",
+    MODEL_ARCH.STABLELM:         "stablelm",
+    MODEL_ARCH.QWEN:             "qwen",
+    MODEL_ARCH.QWEN2:            "qwen2",
+    MODEL_ARCH.QWEN2MOE:         "qwen2moe",
+    MODEL_ARCH.QWEN2VL:          "qwen2vl",
+    MODEL_ARCH.PHI2:             "phi2",
+    MODEL_ARCH.PHI3:             "phi3",
+    MODEL_ARCH.PLAMO:            "plamo",
+    MODEL_ARCH.CODESHELL:        "codeshell",
+    MODEL_ARCH.ORION:            "orion",
+    MODEL_ARCH.INTERNLM2:        "internlm2",
+    MODEL_ARCH.MINICPM:          "minicpm",
+    MODEL_ARCH.MINICPM3:         "minicpm3",
+    MODEL_ARCH.GEMMA:            "gemma",
+    MODEL_ARCH.GEMMA2:           "gemma2",
+    MODEL_ARCH.STARCODER2:       "starcoder2",
+    MODEL_ARCH.RWKV6:            "rwkv6",
+    MODEL_ARCH.MAMBA:            "mamba",
+    MODEL_ARCH.XVERSE:           "xverse",
+    MODEL_ARCH.COMMAND_R:        "command-r",
+    MODEL_ARCH.DBRX:             "dbrx",
+    MODEL_ARCH.OLMO:             "olmo",
+    MODEL_ARCH.OLMO2:            "olmo2",
+    MODEL_ARCH.OLMOE:            "olmoe",
+    MODEL_ARCH.OPENELM:          "openelm",
+    MODEL_ARCH.ARCTIC:           "arctic",
+    MODEL_ARCH.DEEPSEEK:         "deepseek",
+    MODEL_ARCH.DEEPSEEK2:        "deepseek2",
+    MODEL_ARCH.CHATGLM:          "chatglm",
+    MODEL_ARCH.BITNET:           "bitnet",
+    MODEL_ARCH.T5:               "t5",
+    MODEL_ARCH.T5ENCODER:        "t5encoder",
+    MODEL_ARCH.JAIS:             "jais",
+    MODEL_ARCH.NEMOTRON:         "nemotron",
+    MODEL_ARCH.EXAONE:           "exaone",
+    MODEL_ARCH.GRANITE:          "granite",
+    MODEL_ARCH.GRANITE_MOE:      "granitemoe",
+    MODEL_ARCH.CHAMELEON:        "chameleon",
+    MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -529,6 +565,22 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_OUTPUT_NORM:           "enc.output_norm",
     MODEL_TENSOR.CLS:                       "cls",
     MODEL_TENSOR.CLS_OUT:                   "cls.output",
+    MODEL_TENSOR.CONV1D:                    "conv1d",
+    MODEL_TENSOR.CONVNEXT_DW:               "convnext.{bid}.dw",
+    MODEL_TENSOR.CONVNEXT_NORM:             "convnext.{bid}.norm",
+    MODEL_TENSOR.CONVNEXT_PW1:              "convnext.{bid}.pw1",
+    MODEL_TENSOR.CONVNEXT_PW2:              "convnext.{bid}.pw2",
+    MODEL_TENSOR.CONVNEXT_GAMMA:            "convnext.{bid}.gamma",
+    MODEL_TENSOR.POSNET_CONV1:              "posnet.{bid}.conv1",
+    MODEL_TENSOR.POSNET_CONV2:              "posnet.{bid}.conv2",
+    MODEL_TENSOR.POSNET_NORM:               "posnet.{bid}.norm",
+    MODEL_TENSOR.POSNET_NORM1:              "posnet.{bid}.norm1",
+    MODEL_TENSOR.POSNET_NORM2:              "posnet.{bid}.norm2",
+    MODEL_TENSOR.POSNET_ATTN_NORM:          "posnet.{bid}.attn_norm",
+    MODEL_TENSOR.POSNET_ATTN_Q:             "posnet.{bid}.attn_q",
+    MODEL_TENSOR.POSNET_ATTN_K:             "posnet.{bid}.attn_k",
+    MODEL_TENSOR.POSNET_ATTN_V:             "posnet.{bid}.attn_v",
+    MODEL_TENSOR.POSNET_ATTN_OUT:           "posnet.{bid}.attn_output",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -552,6 +604,26 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DECI: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.GROK: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -772,6 +844,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN2VL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.QWEN2MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1141,6 +1227,29 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     MODEL_ARCH.DEEPSEEK2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1330,6 +1439,28 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.WAVTOKENIZER_DEC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.CONV1D,
+        MODEL_TENSOR.CONVNEXT_DW,
+        MODEL_TENSOR.CONVNEXT_NORM,
+        MODEL_TENSOR.CONVNEXT_PW1,
+        MODEL_TENSOR.CONVNEXT_PW2,
+        MODEL_TENSOR.CONVNEXT_GAMMA,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.POSNET_CONV1,
+        MODEL_TENSOR.POSNET_CONV2,
+        MODEL_TENSOR.POSNET_NORM,
+        MODEL_TENSOR.POSNET_NORM1,
+        MODEL_TENSOR.POSNET_NORM2,
+        MODEL_TENSOR.POSNET_ATTN_NORM,
+        MODEL_TENSOR.POSNET_ATTN_Q,
+        MODEL_TENSOR.POSNET_ATTN_K,
+        MODEL_TENSOR.POSNET_ATTN_V,
+        MODEL_TENSOR.POSNET_ATTN_OUT,
+    ],
     # TODO
 }
 
@@ -1339,6 +1470,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DECI: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.BAICHUAN: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -1363,6 +1498,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.DEEPSEEK2: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index e8e61abf86a..e17a4e83147 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -145,11 +145,10 @@ def _get(
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
         end_offs = offset + itemsize * count
-        return (
-            self.data[offset:end_offs]
-            .view(dtype = dtype)[:count]
-            .newbyteorder(override_order or self.byte_order)
-        )
+        arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+        # No explicit override: use self.byte_order, as the removed code did.
+        order = self.byte_order if override_order is None else override_order
+        return arr.view(arr.dtype.newbyteorder(order))
 
     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7a55d129653..3023b539ae8 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -631,6 +631,21 @@ def add_context_length(self, length: int) -> None:
     def add_embedding_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
 
+    def add_features_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_convnext_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_convnext_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
+
     def add_block_count(self, length: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
 
@@ -727,6 +742,12 @@ def add_layer_norm_eps(self, value: float) -> None:
     def add_layer_norm_rms_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
 
+    def add_group_norm_eps(self, value: float) -> None:
+        self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
+
+    def add_group_norm_groups(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
+
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
@@ -751,6 +772,9 @@ def add_pooling_type(self, value: PoolingType) -> None:
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
 
+    def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
+        self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
+
     def add_rope_freq_base(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f0a7b647850..7009a11d46b 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -42,6 +42,7 @@ class TensorNameMap:
             "emb_ln",                     # nomic-bert
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv
+            "backbone.norm",              # wavtokenizer
         ),
 
         # Position embeddings
@@ -60,6 +61,7 @@ class TensorNameMap:
             "lm_head.linear",            # phi2
             "output_layer",              # chatglm
             "head",                      # rwkv
+            "head.out",                  # wavtokenizer
         ),
 
         # Output norm
@@ -80,6 +82,7 @@ class TensorNameMap:
             "transformer.norm",                        # openelm
             "model.norm",                              # nemotron
             "rwkv.ln_out",                             # rwkv
+            "backbone.final_layer_norm",               # wavtokenizer
         ),
 
         # Rope frequencies
@@ -90,6 +93,10 @@ class TensorNameMap:
 
         MODEL_TENSOR.ROPE_FACTORS_LONG: (),
         MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+        MODEL_TENSOR.CONV1D: (
+            "backbone.embed", # wavtokenizer
+        ),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -191,6 +198,7 @@ class TensorNameMap:
             "transformer.h.{bid}.self_attention.dense",                     # falcon
             "h.{bid}.self_attention.dense",                                 # bloom
             "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.linear_attn",                     # deci
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
@@ -306,7 +314,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
         ),
 
         # AWQ-activation gate
@@ -338,7 +346,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
         ),
 
         # Feed-forward down
@@ -379,7 +387,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
@@ -681,6 +689,8 @@ class TensorNameMap:
             "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
         ),
 
+        ############################################################################
+        # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
@@ -693,6 +703,67 @@ class TensorNameMap:
         MODEL_TENSOR.CLS_OUT: (
             "classifier.out_proj", # roberta
         ),
+        #############################################################################
+
+        MODEL_TENSOR.CONVNEXT_DW: (
+            "backbone.convnext.{bid}.dwconv", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_NORM: (
+            "backbone.convnext.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_PW1: (
+            "backbone.convnext.{bid}.pwconv1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_PW2: (
+            "backbone.convnext.{bid}.pwconv2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_GAMMA: (
+            "backbone.convnext.{bid}.gamma", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_CONV1: (
+            "backbone.posnet.{bid}.conv1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_CONV2: (
+            "backbone.posnet.{bid}.conv2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM1: (
+            "backbone.posnet.{bid}.norm1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM2: (
+            "backbone.posnet.{bid}.norm2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_Q: (
+            "backbone.posnet.{bid}.q", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_K: (
+            "backbone.posnet.{bid}.k", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_V: (
+            "backbone.posnet.{bid}.v", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_OUT: (
+            "backbone.posnet.{bid}.proj_out", # wavtokenizer
+        ),
     }
 
     # architecture-specific block mappings
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 33cfe26b7fe..9c39562560c 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.10.0"
+version = "0.13.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py
index 76206781422..f04d5acce27 100755
--- a/gguf-py/tests/test_quants.py
+++ b/gguf-py/tests/test_quants.py
@@ -136,7 +136,7 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType)
         logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}")
 
         sum_diff_bits = np.sum(diff_bits)
-        logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)")
+        logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)")
         return False
 
 
diff --git a/include/llama.h b/include/llama.h
index 55fcb6a137f..18feff99918 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -108,9 +108,11 @@ extern "C" {
     };
 
     enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM = 0,
-        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_NONE   = -1,
+        LLAMA_ROPE_TYPE_NORM   = 0,
+        LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -456,6 +458,7 @@ extern "C" {
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
     // - GGUF array values are not supported by these functions
 
     // Get metadata value as a string by key name
@@ -479,9 +482,6 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
-    // Get a llama model tensor
-    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
@@ -1136,16 +1136,12 @@ extern "C" {
                           const char * grammar_str,
                           const char * grammar_root);
 
+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-                             int32_t   n_vocab,         // llama_n_vocab()
-                         llama_token   special_eos_id,  // llama_token_eos()
-                         llama_token   linefeed_id,     // llama_token_nl()
-                             int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
-                               float   penalty_repeat,  // 1.0 = disabled
-                               float   penalty_freq,    // 0.0 = disabled
-                               float   penalty_present, // 0.0 = disabled
-                                bool   penalize_nl,     // consider newlines as a repeatable token
-                                bool   ignore_eos);     // ignore the end-of-sequence token
+                             int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+                               float   penalty_repeat,   // 1.0 = disabled
+                               float   penalty_freq,     // 0.0 = disabled
+                               float   penalty_present); // 0.0 = disabled
 
     ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
     LLAMA_API struct llama_sampler *    llama_sampler_init_dry(
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 143d987290d..e40d1cc6d98 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -20,11 +20,13 @@ if [ -n "$GGML_CUDA" ]; then
     cmake_opts="-DGGML_CUDA=ON"
 fi
 
+dir="build-bench"
+
 function run {
-    rm -fr build > /dev/null
-    cmake -B build -S . $cmake_opts > /dev/null
-    cmake --build build -t llama-bench > /dev/null
-    build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+    rm -fr ${dir} > /dev/null
+    cmake -B ${dir} -S . $cmake_opts > /dev/null
+    cmake --build ${dir} -t llama-bench > /dev/null
+    ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
 }
 
 git checkout $1 > /dev/null
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 5069ae63827..239c458d8b9 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -126,6 +126,8 @@
 cursor = connection.cursor()
 builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
 
+commit_short_len = len(builds[0][0]) if builds else 8  # 8: fixed prefix length used before; avoids IndexError on an empty table
+
 try:
     repo = git.Repo(".", search_parent_directories=True)
 except git.InvalidGitRepositoryError:
@@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
     seen_hexsha8 = set()
     while heap:
         depth, current_commit = heapq.heappop(heap)
-        current_hexsha8 = commit.hexsha[:8]
+        current_hexsha8 = current_commit.hexsha[:commit_short_len]  # inspect the popped commit, not the search root
         if (current_hexsha8,) in builds:
             return current_hexsha8
-        for parent in commit.parents:
-            parent_hexsha8 = parent.hexsha[:8]
+        for parent in current_commit.parents:  # expand the popped commit so the search walks up the history
+            parent_hexsha8 = parent.hexsha[:commit_short_len]
             if parent_hexsha8 not in seen_hexsha8:
                 seen_hexsha8.add(parent_hexsha8)
                 heapq.heappush(heap, (depth + 1, parent))
@@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
 
     while unvisited:
         current_commit = unvisited.pop(0)
-        visited.append(current_commit.hexsha[:8])
+        visited.append(current_commit.hexsha[:commit_short_len])
         for parent in current_commit.parents:
-            if parent.hexsha[:8] not in visited:
+            if parent.hexsha[:commit_short_len] not in visited:
                 unvisited.append(parent)
 
     return visited
@@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
     if repo is None:
         return hexsha8
     for h in repo.heads:
-        if h.commit.hexsha[:8] == hexsha8:
+        if h.commit.hexsha[:commit_short_len] == hexsha8:
             return h.name
     for t in repo.tags:
-        if t.commit.hexsha[:8] == hexsha8:
+        if t.commit.hexsha[:commit_short_len] == hexsha8:
             return t.name
     return hexsha8
 
@@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
         return None
     for h in repo.heads:
         if h.name == name:
-            return h.commit.hexsha[:8]
+            return h.commit.hexsha[:commit_short_len]
     for t in repo.tags:
         if t.name == name:
-            return t.commit.hexsha[:8]
+            return t.commit.hexsha[:commit_short_len]
     for c in repo.iter_commits("--all"):
-        if c.hexsha[:8] == name[:8]:
-            return c.hexsha[:8]
+        if c.hexsha[:commit_short_len] == name[:commit_short_len]:
+            return c.hexsha[:commit_short_len]
     return None
 
 
diff --git a/scripts/hf.sh b/scripts/hf.sh
index 85c2c4d9a95..b251925fa45 100755
--- a/scripts/hf.sh
+++ b/scripts/hf.sh
@@ -26,7 +26,7 @@ function has_cmd {
 }
 
 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s/%s %s"
+    cmd="wget -q -c -O %s/%s %s"
 elif has_cmd curl; then
     cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 47eae44f7d7..b4ac38bbf46 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-74d66b63eaf207a24f3e93bb922aba131cbf2906
+e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f3b3908b112..2d3ea099457 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,10 +1,3 @@
-# TODO: should not use this
-if (WIN32)
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
-
 llama_add_compile_flags()
 
 #
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 74e9f64b393..76d0cb3a2ff 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t               chr,
-              llama_grammar_stacks & stacks_new) {
-    stacks_new.clear();
-    stacks_new.reserve(stacks.size());
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
 
-    for (const auto & stack : stacks) {
+    for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
             continue;
         }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
             if (!llama_grammar_is_end_of_sequence(pos)) {
                 new_stack.push_back(pos);
             }
-            llama_grammar_advance_stack(rules, new_stack, stacks_new);
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
         }
     }
+
+    grammar->stacks = std::move(stacks_new);
 }
 
 llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+    };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
             for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
                 for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                     if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                         result->stacks[is][ie]  =  &result->rules[ir0][ir1];
+                        result->stacks[is][ie] =  &result->rules[ir0][ir1];
                     }
                 }
             }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
-    llama_grammar_stacks stacks_new;
-
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-        grammar.stacks = std::move(stacks_new);
+        llama_grammar_accept(&grammar, *it);
     }
 
     grammar.partial_utf8 = decoded.second;
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index f529ce351e4..13e940fb52e 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -58,6 +58,7 @@ using llama_grammar_rules      = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 
+// TODO: remove; currently still needed by the tests
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
       llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
@@ -65,11 +66,7 @@ const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-                          uint32_t   chr,
-              llama_grammar_stacks & stacks_new);
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
 
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_rules      & rules,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index ecf0e929621..73ea5e66eac 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool    penalize_nl;
-    const bool    ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
     }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-            ctx->n_vocab,
-            ctx->special_eos_id,
-            ctx->linefeed_id,
             ctx->penalty_last_n,
             ctx->penalty_repeat,
             ctx->penalty_freq,
-            ctx->penalty_present,
-            ctx->penalize_nl,
-            ctx->ignore_eos);
+            ctx->penalty_present);
 
     // copy the state
     {
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8c9aaf5a0c8..0a477d6dd85 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -738,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
@@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 }
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
-    return vocab.special_bos_id;
+    return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
 }
 
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
@@ -1867,6 +1867,10 @@ int32_t llama_detokenize_impl(
                          int32_t   text_len_max,
                             bool   remove_special,
                             bool   unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 4bb16d2e429..a9b0da5ef3e 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -45,7 +45,7 @@ struct llama_vocab {
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
-    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
     id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id = 13;
diff --git a/src/llama.cpp b/src/llama.cpp
index dbcaddeb817..90bca3e7dd4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
@@ -163,6 +164,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
@@ -183,6 +185,7 @@ enum llm_arch {
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -195,61 +198,66 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"        },
-    { LLM_ARCH_FALCON,          "falcon"       },
-    { LLM_ARCH_GROK,            "grok"         },
-    { LLM_ARCH_GPT2,            "gpt2"         },
-    { LLM_ARCH_GPTJ,            "gptj"         },
-    { LLM_ARCH_GPTNEOX,         "gptneox"      },
-    { LLM_ARCH_MPT,             "mpt"          },
-    { LLM_ARCH_BAICHUAN,        "baichuan"     },
-    { LLM_ARCH_STARCODER,       "starcoder"    },
-    { LLM_ARCH_REFACT,          "refact"       },
-    { LLM_ARCH_BERT,            "bert"         },
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"   },
-    { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2" },
-    { LLM_ARCH_BLOOM,           "bloom"        },
-    { LLM_ARCH_STABLELM,        "stablelm"     },
-    { LLM_ARCH_QWEN,            "qwen"         },
-    { LLM_ARCH_QWEN2,           "qwen2"        },
-    { LLM_ARCH_QWEN2MOE,        "qwen2moe"     },
-    { LLM_ARCH_PHI2,            "phi2"         },
-    { LLM_ARCH_PHI3,            "phi3"         },
-    { LLM_ARCH_PLAMO,           "plamo"        },
-    { LLM_ARCH_CODESHELL,       "codeshell"    },
-    { LLM_ARCH_ORION,           "orion"        },
-    { LLM_ARCH_INTERNLM2,       "internlm2"    },
-    { LLM_ARCH_MINICPM,         "minicpm"      },
-    { LLM_ARCH_MINICPM3,        "minicpm3"     },
-    { LLM_ARCH_GEMMA,           "gemma"        },
-    { LLM_ARCH_GEMMA2,          "gemma2"       },
-    { LLM_ARCH_STARCODER2,      "starcoder2"   },
-    { LLM_ARCH_MAMBA,           "mamba"        },
-    { LLM_ARCH_XVERSE,          "xverse"       },
-    { LLM_ARCH_COMMAND_R,       "command-r"    },
-    { LLM_ARCH_DBRX,            "dbrx"         },
-    { LLM_ARCH_OLMO,            "olmo"         },
-    { LLM_ARCH_OLMO2,           "olmo2"        },
-    { LLM_ARCH_OLMOE,           "olmoe"        },
-    { LLM_ARCH_OPENELM,         "openelm"      },
-    { LLM_ARCH_ARCTIC,          "arctic"       },
-    { LLM_ARCH_DEEPSEEK2,       "deepseek2"    },
-    { LLM_ARCH_CHATGLM,         "chatglm"      },
-    { LLM_ARCH_BITNET,          "bitnet"       },
-    { LLM_ARCH_T5,              "t5"           },
-    { LLM_ARCH_T5ENCODER,       "t5encoder"    },
-    { LLM_ARCH_JAIS,            "jais"         },
-    { LLM_ARCH_NEMOTRON,        "nemotron"     },
-    { LLM_ARCH_EXAONE,          "exaone"       },
-    { LLM_ARCH_RWKV6,           "rwkv6"        },
-    { LLM_ARCH_GRANITE,         "granite"      },
-    { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
-    { LLM_ARCH_CHAMELEON,       "chameleon"    },
-    { LLM_ARCH_UNKNOWN,         "(unknown)"    },
+    { LLM_ARCH_LLAMA,            "llama"            },
+    { LLM_ARCH_DECI,             "deci"            },
+    { LLM_ARCH_FALCON,           "falcon"           },
+    { LLM_ARCH_GROK,             "grok"             },
+    { LLM_ARCH_GPT2,             "gpt2"             },
+    { LLM_ARCH_GPTJ,             "gptj"             },
+    { LLM_ARCH_GPTNEOX,          "gptneox"          },
+    { LLM_ARCH_MPT,              "mpt"              },
+    { LLM_ARCH_BAICHUAN,         "baichuan"         },
+    { LLM_ARCH_STARCODER,        "starcoder"        },
+    { LLM_ARCH_REFACT,           "refact"           },
+    { LLM_ARCH_BERT,             "bert"             },
+    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
+    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
+    { LLM_ARCH_BLOOM,            "bloom"            },
+    { LLM_ARCH_STABLELM,         "stablelm"         },
+    { LLM_ARCH_QWEN,             "qwen"             },
+    { LLM_ARCH_QWEN2,            "qwen2"            },
+    { LLM_ARCH_QWEN2MOE,         "qwen2moe"         },
+    { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
+    { LLM_ARCH_PHI2,             "phi2"             },
+    { LLM_ARCH_PHI3,             "phi3"             },
+    { LLM_ARCH_PLAMO,            "plamo"            },
+    { LLM_ARCH_CODESHELL,        "codeshell"        },
+    { LLM_ARCH_ORION,            "orion"            },
+    { LLM_ARCH_INTERNLM2,        "internlm2"        },
+    { LLM_ARCH_MINICPM,          "minicpm"          },
+    { LLM_ARCH_MINICPM3,         "minicpm3"         },
+    { LLM_ARCH_GEMMA,            "gemma"            },
+    { LLM_ARCH_GEMMA2,           "gemma2"           },
+    { LLM_ARCH_STARCODER2,       "starcoder2"       },
+    { LLM_ARCH_MAMBA,            "mamba"            },
+    { LLM_ARCH_XVERSE,           "xverse"           },
+    { LLM_ARCH_COMMAND_R,        "command-r"        },
+    { LLM_ARCH_DBRX,             "dbrx"             },
+    { LLM_ARCH_OLMO,             "olmo"             },
+    { LLM_ARCH_OLMO2,            "olmo2"            },
+    { LLM_ARCH_OLMOE,            "olmoe"            },
+    { LLM_ARCH_OPENELM,          "openelm"          },
+    { LLM_ARCH_ARCTIC,           "arctic"           },
+    { LLM_ARCH_DEEPSEEK,         "deepseek"         },
+    { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
+    { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_BITNET,           "bitnet"           },
+    { LLM_ARCH_T5,               "t5"               },
+    { LLM_ARCH_T5ENCODER,        "t5encoder"        },
+    { LLM_ARCH_JAIS,             "jais"             },
+    { LLM_ARCH_NEMOTRON,         "nemotron"         },
+    { LLM_ARCH_EXAONE,           "exaone"           },
+    { LLM_ARCH_RWKV6,            "rwkv6"            },
+    { LLM_ARCH_GRANITE,          "granite"          },
+    { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
+    { LLM_ARCH_CHAMELEON,        "chameleon"        },
+    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
 enum llm_kv {
@@ -269,6 +277,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
@@ -300,6 +309,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -308,6 +319,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -362,6 +374,12 @@ enum llm_kv {
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
 
+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -385,6 +403,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
     { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
     { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
+    { LLM_KV_FEATURES_LENGTH,                   "%s.features_length"                   },
     { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
     { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
@@ -416,6 +435,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
     { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS,          "%s.attention.group_norm_epsilon"     },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,       "%s.attention.group_norm_groups"      },
     { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
     { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
@@ -424,6 +445,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS,          "%s.rope.dimension_sections"              },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                       },
     { LLM_KV_ROPE_SCALE_LINEAR,                "%s.rope.scale_linear"                    },
     { LLM_KV_ROPE_SCALING_TYPE,                "%s.rope.scaling.type"                    },
@@ -445,6 +467,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_WKV_HEAD_SIZE,                    "%s.wkv.head_size" },
 
+    { LLM_KV_POSNET_EMBEDDING_LENGTH,          "%s.posnet.embedding_length" },
+    { LLM_KV_POSNET_BLOCK_COUNT,               "%s.posnet.block_count"      },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH,        "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT,             "%s.convnext.block_count"      },
+
     { LLM_KV_TOKENIZER_MODEL,                  "tokenizer.ggml.model"                    },
     { LLM_KV_TOKENIZER_PRE,                    "tokenizer.ggml.pre"                      },
     { LLM_KV_TOKENIZER_LIST,                   "tokenizer.ggml.tokens"                   },
@@ -603,6 +631,22 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CONV1D,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
+    LLM_TENSOR_POS_NET_CONV1,
+    LLM_TENSOR_POS_NET_CONV2,
+    LLM_TENSOR_POS_NET_NORM,
+    LLM_TENSOR_POS_NET_NORM1,
+    LLM_TENSOR_POS_NET_NORM2,
+    LLM_TENSOR_POS_NET_ATTN_NORM,
+    LLM_TENSOR_POS_NET_ATTN_Q,
+    LLM_TENSOR_POS_NET_ATTN_K,
+    LLM_TENSOR_POS_NET_ATTN_V,
+    LLM_TENSOR_POS_NET_ATTN_OUT,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -632,6 +676,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DECI,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_BAICHUAN,
         {
@@ -898,6 +968,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_QWEN2MOE,
         {
@@ -1288,6 +1375,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FREQS,         "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,      "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1543,6 +1657,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_WAVTOKENIZER_DEC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_CONV1D,            "conv1d" },
+            { LLM_TENSOR_CONVNEXT_DW,       "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM,     "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1,      "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2,      "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA,    "convnext.%d.gamma" },
+            { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
+            { LLM_TENSOR_OUTPUT,            "output" },
+            { LLM_TENSOR_POS_NET_CONV1,     "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2,     "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM,      "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1,     "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2,     "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q,    "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K,    "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V,    "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT,  "posnet.%d.attn_output" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1562,6 +1701,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
     LLM_CHAT_TEMPLATE_GEMMA,
@@ -1579,6 +1719,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_EXAONE_3,
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
     LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -1593,6 +1735,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
+    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
     { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR            },
     { "monarch",           LLM_CHAT_TEMPLATE_MONARCH           },
     { "gemma",             LLM_CHAT_TEMPLATE_GEMMA             },
@@ -1610,6 +1753,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
     { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
     { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
+    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
+    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -1794,7 +1939,7 @@ struct llama_file {
         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                     NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
         if (!bufLen) {
-            ret = format("Win32 error code: %s", error_code);
+            ret = format("Win32 error code: %lx", error_code);
         } else {
             ret = lpMsgBuf;
             LocalFree(lpMsgBuf);
@@ -2132,7 +2277,7 @@ struct llama_mmap {
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
             // may fail on pre-Windows 8 systems
-            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
             if (pPrefetchVirtualMemory) {
                 // advise the kernel to preload the mapped memory
@@ -2431,15 +2576,26 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
+struct llama_hparams_posnet { // posnet hyperparameters; filled by llm_load_hparams when arch is LLM_ARCH_WAVTOKENIZER_DEC
+    uint32_t n_embd;  // read from LLM_KV_POSNET_EMBEDDING_LENGTH (no default initializer here)
+    uint32_t n_layer; // read from LLM_KV_POSNET_BLOCK_COUNT (no default initializer here)
+};
+
+struct llama_hparams_convnext { // convnext hyperparameters; filled by llm_load_hparams when arch is LLM_ARCH_WAVTOKENIZER_DEC
+    uint32_t n_embd;  // read from LLM_KV_CONVNEXT_EMBEDDING_LENGTH (no default initializer here)
+    uint32_t n_layer; // read from LLM_KV_CONVNEXT_BLOCK_COUNT (no default initializer here)
+};
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;
 
-    uint32_t n_vocab;
+    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2450,6 +2606,10 @@ struct llama_hparams {
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
+    // for WavTokenizer
+    struct llama_hparams_posnet   posnet;
+    struct llama_hparams_convnext convnext;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2464,6 +2624,9 @@ struct llama_hparams {
 
     float f_norm_eps;
     float f_norm_rms_eps;
+    float f_norm_group_eps;
+
+    uint32_t n_norm_groups;
 
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;
@@ -2474,11 +2637,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
-    float    rope_attn_factor = 1.0f;
-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
+    float     rope_attn_factor = 1.0f;
+    float     rope_freq_base_train;
+    float     rope_freq_scale_train;
+    uint32_t  n_ctx_orig_yarn;
+    float     rope_yarn_log_mul;
+    int       rope_sections[4];
 
     // for State Space Models
     uint32_t ssm_d_conv  = 0;
@@ -2508,63 +2672,6 @@ struct llama_hparams {
     enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only    != other.vocab_only)    return true;
-        if (this->n_vocab       != other.n_vocab)       return true;
-        if (this->n_ctx_train   != other.n_ctx_train)   return true;
-        if (this->n_embd        != other.n_embd)        return true;
-        if (this->n_layer       != other.n_layer)       return true;
-        if (this->n_rot         != other.n_rot)         return true;
-        if (this->n_swa         != other.n_swa)         return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_expert      != other.n_expert)      return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->n_head_arr    != other.n_head_arr)    return true;
-        if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-        if (this->n_ff_arr      != other.n_ff_arr)      return true;
-
-        if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
-        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-        if (this->n_lora_q           != other.n_lora_q)           return true;
-        if (this->n_lora_kv          != other.n_lora_kv)          return true;
-        if (this->n_ff_exp           != other.n_ff_exp)           return true;
-        if (this->n_ff_shexp         != other.n_ff_shexp)         return true;
-        if (this->n_expert_shared    != other.n_expert_shared)    return true;
-
-        if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-
-        if (this->ssm_d_conv  != other.ssm_d_conv)  return true;
-        if (this->ssm_d_inner != other.ssm_d_inner) return true;
-        if (this->ssm_d_state != other.ssm_d_state) return true;
-        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
-        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
-        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
-        if (this->time_mix_extra_dim     != other.time_mix_extra_dim)     return true;
-        if (this->time_decay_extra_dim   != other.time_decay_extra_dim)   return true;
-        if (this->wkv_head_size          != other.wkv_head_size)          return true;
-
-        if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale,      other.f_residual_scale,      EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale,     other.f_embedding_scale,     EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale,     other.f_attention_scale,     EPSILON)) return true;
-
-        return false;
-    }
-
     uint32_t n_head(uint32_t il = 0) const {
         if (il < n_layer) {
             return n_head_arr[il];
@@ -2617,21 +2724,21 @@ struct llama_hparams {
         if (wkv_head_size != 0) {
             // for RWKV models
             return 2 * n_embd;
-        } else {
-            // TODO: maybe support other convolution strides than 1
-            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
         }
+
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
     }
 
     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
         if (wkv_head_size != 0) {
             // corresponds to RWKV's wkv_states size
             return n_embd * wkv_head_size;
-        } else {
-            // corresponds to Mamba's ssm_states size
-            return ssm_d_state * ssm_d_inner;
         }
+
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
     }
 };
 
@@ -2669,142 +2776,187 @@ struct llama_cparams {
     void * cb_eval_user_data;
 };
 
-// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
-struct llama_layer {
-    llama_layer() {
-        // initialize all pointers to NULL
-        std::memset(this, 0, sizeof(*this));
-    }
+struct llama_layer_posnet { // per-layer posnet tensors; every pointer defaults to nullptr
+    // resnet: two norm/conv stages (the *_b members are presumably the matching biases - TODO confirm)
+    struct ggml_tensor * norm1   = nullptr;
+    struct ggml_tensor * norm1_b = nullptr;
+
+    struct ggml_tensor * conv1   = nullptr;
+    struct ggml_tensor * conv1_b = nullptr;
+
+    struct ggml_tensor * norm2   = nullptr;
+    struct ggml_tensor * norm2_b = nullptr;
 
+    struct ggml_tensor * conv2   = nullptr;
+    struct ggml_tensor * conv2_b = nullptr;
+
+    // attention: input norm, q/k/v projections and output projection
+    struct ggml_tensor * attn_norm   = nullptr;
+    struct ggml_tensor * attn_norm_b = nullptr;
+
+    struct ggml_tensor * attn_q   = nullptr;
+    struct ggml_tensor * attn_q_b = nullptr;
+
+    struct ggml_tensor * attn_k   = nullptr;
+    struct ggml_tensor * attn_k_b = nullptr;
+
+    struct ggml_tensor * attn_v   = nullptr;
+    struct ggml_tensor * attn_v_b = nullptr;
+
+    struct ggml_tensor * attn_o   = nullptr;
+    struct ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct ggml_tensor * norm   = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext { // per-layer convnext tensors; every pointer defaults to nullptr
+    struct ggml_tensor * dw   = nullptr;
+    struct ggml_tensor * dw_b = nullptr;
+
+    struct ggml_tensor * norm   = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+
+    struct ggml_tensor * pw1   = nullptr;
+    struct ggml_tensor * pw1_b = nullptr;
+
+    struct ggml_tensor * pw2   = nullptr;
+    struct ggml_tensor * pw2_b = nullptr;
+
+    struct ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer {
     // normalization
-    struct ggml_tensor * attn_norm;
-    struct ggml_tensor * attn_norm_b;
-    struct ggml_tensor * attn_norm_2;
-    struct ggml_tensor * attn_norm_2_b;
-    struct ggml_tensor * attn_q_norm;
-    struct ggml_tensor * attn_q_norm_b;
-    struct ggml_tensor * attn_k_norm;
-    struct ggml_tensor * attn_k_norm_b;
-    struct ggml_tensor * attn_out_norm;
-    struct ggml_tensor * attn_out_norm_b;
-    struct ggml_tensor * attn_q_a_norm;
-    struct ggml_tensor * attn_kv_a_norm;
-    struct ggml_tensor * attn_sub_norm;
-    struct ggml_tensor * attn_post_norm;
-    struct ggml_tensor * ffn_sub_norm;
-    struct ggml_tensor * attn_norm_cross;
-    struct ggml_tensor * attn_norm_enc;
+    struct ggml_tensor * attn_norm       = nullptr;
+    struct ggml_tensor * attn_norm_b     = nullptr;
+    struct ggml_tensor * attn_norm_2     = nullptr;
+    struct ggml_tensor * attn_norm_2_b   = nullptr;
+    struct ggml_tensor * attn_q_norm     = nullptr;
+    struct ggml_tensor * attn_q_norm_b   = nullptr;
+    struct ggml_tensor * attn_k_norm     = nullptr;
+    struct ggml_tensor * attn_k_norm_b   = nullptr;
+    struct ggml_tensor * attn_out_norm   = nullptr;
+    struct ggml_tensor * attn_out_norm_b = nullptr;
+    struct ggml_tensor * attn_q_a_norm   = nullptr;
+    struct ggml_tensor * attn_kv_a_norm  = nullptr;
+    struct ggml_tensor * attn_sub_norm   = nullptr;
+    struct ggml_tensor * attn_post_norm  = nullptr;
+    struct ggml_tensor * ffn_sub_norm    = nullptr;
+    struct ggml_tensor * attn_norm_cross = nullptr;
+    struct ggml_tensor * attn_norm_enc   = nullptr;
 
     // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-    struct ggml_tensor * wqkv;
-    struct ggml_tensor * wq_a;
-    struct ggml_tensor * wq_b;
-    struct ggml_tensor * wkv_a_mqa;
-    struct ggml_tensor * wkv_b;
-    struct ggml_tensor * wq_cross;
-    struct ggml_tensor * wk_cross;
-    struct ggml_tensor * wv_cross;
-    struct ggml_tensor * wo_cross;
-    struct ggml_tensor * wq_enc;
-    struct ggml_tensor * wk_enc;
-    struct ggml_tensor * wv_enc;
-    struct ggml_tensor * wo_enc;
+    struct ggml_tensor * wq        = nullptr;
+    struct ggml_tensor * wk        = nullptr;
+    struct ggml_tensor * wv        = nullptr;
+    struct ggml_tensor * wo        = nullptr;
+    struct ggml_tensor * wqkv      = nullptr;
+    struct ggml_tensor * wq_a      = nullptr;
+    struct ggml_tensor * wq_b      = nullptr;
+    struct ggml_tensor * wkv_a_mqa = nullptr;
+    struct ggml_tensor * wkv_b     = nullptr;
+    struct ggml_tensor * wq_cross  = nullptr;
+    struct ggml_tensor * wk_cross  = nullptr;
+    struct ggml_tensor * wv_cross  = nullptr;
+    struct ggml_tensor * wo_cross  = nullptr;
+    struct ggml_tensor * wq_enc    = nullptr;
+    struct ggml_tensor * wk_enc    = nullptr;
+    struct ggml_tensor * wv_enc    = nullptr;
+    struct ggml_tensor * wo_enc    = nullptr;
 
     // attention bias
-    struct ggml_tensor * bq;
-    struct ggml_tensor * bk;
-    struct ggml_tensor * bv;
-    struct ggml_tensor * bo;
-    struct ggml_tensor * bqkv;
+    struct ggml_tensor * bq   = nullptr;
+    struct ggml_tensor * bk   = nullptr;
+    struct ggml_tensor * bv   = nullptr;
+    struct ggml_tensor * bo   = nullptr;
+    struct ggml_tensor * bqkv = nullptr;
 
     // relative position bias
-    struct ggml_tensor * attn_rel_b;
-    struct ggml_tensor * attn_rel_b_enc;
-    struct ggml_tensor * attn_rel_b_cross;
+    struct ggml_tensor * attn_rel_b       = nullptr;
+    struct ggml_tensor * attn_rel_b_enc   = nullptr;
+    struct ggml_tensor * attn_rel_b_cross = nullptr;
 
     // normalization
-    struct ggml_tensor * ffn_norm;
-    struct ggml_tensor * ffn_norm_b;
-    struct ggml_tensor * ffn_post_norm;
-    struct ggml_tensor * layer_out_norm;
-    struct ggml_tensor * layer_out_norm_b;
-    struct ggml_tensor * ffn_norm_exps;
-    struct ggml_tensor * ffn_norm_enc;
+    struct ggml_tensor * ffn_norm         = nullptr;
+    struct ggml_tensor * ffn_norm_b       = nullptr;
+    struct ggml_tensor * ffn_post_norm    = nullptr;
+    struct ggml_tensor * layer_out_norm   = nullptr;
+    struct ggml_tensor * layer_out_norm_b = nullptr;
+    struct ggml_tensor * ffn_norm_exps    = nullptr;
+    struct ggml_tensor * ffn_norm_enc     = nullptr;
 
     // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-    struct ggml_tensor * ffn_gate_enc;
-    struct ggml_tensor * ffn_down_enc;
-    struct ggml_tensor * ffn_up_enc;
+    struct ggml_tensor * ffn_gate     = nullptr; // w1
+    struct ggml_tensor * ffn_down     = nullptr; // w2
+    struct ggml_tensor * ffn_up       = nullptr; // w3
+    struct ggml_tensor * ffn_gate_enc = nullptr;
+    struct ggml_tensor * ffn_down_enc = nullptr;
+    struct ggml_tensor * ffn_up_enc   = nullptr;
 
     // ff MoE
-    struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor * ffn_gate_exps;
-    struct ggml_tensor * ffn_down_exps;
-    struct ggml_tensor * ffn_up_exps ;
+    struct ggml_tensor * ffn_gate_inp  = nullptr;
+    struct ggml_tensor * ffn_gate_exps = nullptr;
+    struct ggml_tensor * ffn_down_exps = nullptr;
+    struct ggml_tensor * ffn_up_exps   = nullptr;
 
     // ff shared expert (shexp)
-    struct ggml_tensor * ffn_gate_inp_shexp;
-    struct ggml_tensor * ffn_gate_shexp;
-    struct ggml_tensor * ffn_down_shexp;
-    struct ggml_tensor * ffn_up_shexp;
+    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
+    struct ggml_tensor * ffn_gate_shexp     = nullptr;
+    struct ggml_tensor * ffn_down_shexp     = nullptr;
+    struct ggml_tensor * ffn_up_shexp       = nullptr;
 
     // ff bias
-    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b; // b3
-    struct ggml_tensor * ffn_act;
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b   = nullptr; // b3
+    struct ggml_tensor * ffn_act    = nullptr;
 
     // mamba proj
-    struct ggml_tensor * ssm_in;
-    struct ggml_tensor * ssm_x;
-    struct ggml_tensor * ssm_dt;
-    struct ggml_tensor * ssm_out;
+    struct ggml_tensor * ssm_in  = nullptr;
+    struct ggml_tensor * ssm_x   = nullptr;
+    struct ggml_tensor * ssm_dt  = nullptr;
+    struct ggml_tensor * ssm_out = nullptr;
 
     // mamba
-    struct ggml_tensor * ssm_conv1d;
-    struct ggml_tensor * ssm_a;
-    struct ggml_tensor * ssm_d;
+    struct ggml_tensor * ssm_conv1d = nullptr;
+    struct ggml_tensor * ssm_a      = nullptr;
+    struct ggml_tensor * ssm_d      = nullptr;
 
     // mamba bias
-    struct ggml_tensor * ssm_conv1d_b;
-    struct ggml_tensor * ssm_dt_b;
+    struct ggml_tensor * ssm_conv1d_b = nullptr;
+    struct ggml_tensor * ssm_dt_b     = nullptr;
 
     // rwkv
-    struct ggml_tensor * time_mix_w1;
-    struct ggml_tensor * time_mix_w2;
-    struct ggml_tensor * time_mix_lerp_x;
-    struct ggml_tensor * time_mix_lerp_w;
-    struct ggml_tensor * time_mix_lerp_k;
-    struct ggml_tensor * time_mix_lerp_v;
-    struct ggml_tensor * time_mix_lerp_r;
-    struct ggml_tensor * time_mix_lerp_g;
-
-    struct ggml_tensor * time_mix_first;
-    struct ggml_tensor * time_mix_decay;
-    struct ggml_tensor * time_mix_decay_w1;
-    struct ggml_tensor * time_mix_decay_w2;
-    struct ggml_tensor * time_mix_key;
-    struct ggml_tensor * time_mix_value;
-    struct ggml_tensor * time_mix_receptance;
-    struct ggml_tensor * time_mix_gate;
-
-    struct ggml_tensor * time_mix_ln;
-    struct ggml_tensor * time_mix_ln_b;
-    struct ggml_tensor * time_mix_output;
-
-    struct ggml_tensor * channel_mix_lerp_k;
-    struct ggml_tensor * channel_mix_lerp_r;
-
-    struct ggml_tensor * channel_mix_key;
-    struct ggml_tensor * channel_mix_receptance;
-    struct ggml_tensor * channel_mix_value;
+    struct ggml_tensor * time_mix_w1         = nullptr;
+    struct ggml_tensor * time_mix_w2         = nullptr;
+    struct ggml_tensor * time_mix_lerp_x     = nullptr;
+    struct ggml_tensor * time_mix_lerp_w     = nullptr;
+    struct ggml_tensor * time_mix_lerp_k     = nullptr;
+    struct ggml_tensor * time_mix_lerp_v     = nullptr;
+    struct ggml_tensor * time_mix_lerp_r     = nullptr;
+    struct ggml_tensor * time_mix_lerp_g     = nullptr;
+
+    struct ggml_tensor * time_mix_first      = nullptr;
+    struct ggml_tensor * time_mix_decay      = nullptr;
+    struct ggml_tensor * time_mix_decay_w1   = nullptr;
+    struct ggml_tensor * time_mix_decay_w2   = nullptr;
+    struct ggml_tensor * time_mix_key        = nullptr;
+    struct ggml_tensor * time_mix_value      = nullptr;
+    struct ggml_tensor * time_mix_receptance = nullptr;
+    struct ggml_tensor * time_mix_gate       = nullptr;
+
+    struct ggml_tensor * time_mix_ln     = nullptr;
+    struct ggml_tensor * time_mix_ln_b   = nullptr;
+    struct ggml_tensor * time_mix_output = nullptr;
+
+    struct ggml_tensor * channel_mix_lerp_k = nullptr;
+    struct ggml_tensor * channel_mix_lerp_r = nullptr;
+
+    struct ggml_tensor * channel_mix_key        = nullptr;
+    struct ggml_tensor * channel_mix_receptance = nullptr;
+    struct ggml_tensor * channel_mix_value      = nullptr;
 
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
@@ -2812,13 +2964,17 @@ struct llama_layer {
     struct ggml_tensor * rope_freqs = nullptr;
 
     // bitnet scale
-    struct ggml_tensor * wq_scale;
-    struct ggml_tensor * wk_scale;
-    struct ggml_tensor * wv_scale;
-    struct ggml_tensor * wo_scale;
-    struct ggml_tensor * ffn_gate_scale;
-    struct ggml_tensor * ffn_up_scale;
-    struct ggml_tensor * ffn_down_scale;
+    struct ggml_tensor * wq_scale       = nullptr;
+    struct ggml_tensor * wk_scale       = nullptr;
+    struct ggml_tensor * wv_scale       = nullptr;
+    struct ggml_tensor * wo_scale       = nullptr;
+    struct ggml_tensor * ffn_gate_scale = nullptr;
+    struct ggml_tensor * ffn_up_scale   = nullptr;
+    struct ggml_tensor * ffn_down_scale = nullptr;
+
+    struct llama_layer_posnet posnet;
+
+    struct llama_layer_convnext convnext;
 };
 
 // very similar to llama_batch,
@@ -2949,6 +3105,9 @@ struct llama_model {
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
+    struct ggml_tensor * conv1d = nullptr;
+    struct ggml_tensor * conv1d_b = nullptr;
+
     std::vector<llama_layer> layers;
 
     // gguf metadata
@@ -3033,6 +3192,7 @@ struct llama_sbatch {
     // batch indices of the output
     std::vector<size_t> out_ids;
     std::vector<llama_sbatch_seq> seq;
+
     const llama_batch * batch = nullptr;
 
     // buffers for the ubatch
@@ -3378,6 +3538,11 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate multi-dimensional position encoding methods
+    // number of position ids each token gets; 1 per token in most cases.
+    // when using m-rope, there are 3 position ids per token, representing a 3-dimensional coordinate.
+    int n_pos_per_token = 1;
+
     // output of the encoder part of the encoder-decoder models
     std::vector<float> embd_enc;
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -3448,6 +3613,17 @@ static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
+static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+    // linear scan over the (name, tensor) pairs; the first entry whose name equals `name` wins
+    for (const auto & entry : model->tensors_by_name) {
+        if (entry.first == name) {
+            return entry.second;
+        }
+    }
+    // no entry carries the requested name
+    return nullptr;
+}
+
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -3501,7 +3677,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const int64_t  n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3542,10 +3720,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %u, n_embd_v_gqa = %u\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); // both sizes are uint32_t, hence %u
+
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -5458,7 +5638,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -5471,6 +5651,16 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
 
+    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
+    }
+
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
@@ -5479,13 +5669,13 @@ static void llm_load_hparams(
         GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the per-layer hparams
+    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
 
-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5534,7 +5724,7 @@ static void llm_load_hparams(
 
         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd_head_k) {
                 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
             }
@@ -5574,6 +5764,15 @@ static void llm_load_hparams(
                     }
                 }
             } break;
+        case LLM_ARCH_DECI:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5747,6 +5946,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                std::array<int, 4> section_dims;
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+                std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+            }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6057,6 +6263,19 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6210,6 +6429,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+            } break;
         default: (void)0;
     }
 
@@ -6239,7 +6465,7 @@ static void llm_load_vocab(
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
 
-        if (tokenizer_model == "no_vocab") {
+        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -6377,7 +6603,8 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "llama3"   ||
                     tokenizer_pre == "llama-v3" ||
-                    tokenizer_pre == "llama-bpe") {
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 vocab.tokenizer_ignore_merges = true;
                 vocab.tokenizer_add_bos = true;
@@ -6403,10 +6630,12 @@ static void llm_load_vocab(
                     tokenizer_pre == "phi-2"   ||
                     tokenizer_pre == "jina-es" ||
                     tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat"   ||
                     tokenizer_pre == "jina-v1-en" ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "jina-v2-code") {
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                     tokenizer_pre == "refact") {
@@ -6476,6 +6705,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "minerva-7b") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                tokenizer_pre == "megrez") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -7054,6 +7286,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
+    if (model.arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
+    }
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
@@ -7210,6 +7449,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+    {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_CONV1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_CONV2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_ATTN_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_ATTN_Q,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_V,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_OUT,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_CONVNEXT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_PW1,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_PW2,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_GAMMA,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };
 
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -7314,6 +7569,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                 op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
+        case GGML_OP_IM2COL: // build a throw-away im2col op on w so the buffer type can be checked against it
+            {
+                const int n_embd = hparams.n_embd;
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); // placeholder F32 input, sized from n_embd and w->ne[1]
+                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); // NOTE(review): presumably s0=1, s1=0, p0=0, p1=0, d0=1, d1=0, is_2D=false -- confirm against ggml_im2col's parameter order
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -7444,7 +7705,8 @@ static bool llm_load_tensors(
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int n_layer     = hparams.n_layer;
+    const int n_layer = hparams.n_layer;
+
     bool use_mmap_buffer = true;
 
     // build a list of buffer types for the CPU and GPU devices
@@ -7719,6 +7981,68 @@ static bool llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_DECI:
+                { // per-layer n_head / n_head_kv decide which attention tensors are created for each layer
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed (same tensor name, flagged TENSOR_DUPLICATED)
+                    if (model.output == NULL) {
+                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
+                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
+                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i); // same value as n_embd_v_gqa; only used for the optional K/V bias shapes below
+                        const int64_t n_ff          = hparams.n_ff(i);
+                        const int64_t n_head        = hparams.n_head(i);
+                        const int64_t n_head_kv     = hparams.n_head_kv(i);
+
+                        if (n_head_kv == 0 && n_head > 0) {
+                            // linear attention for DeciLMCausalModel: only the norm and a square {n_embd, n_embd} wo are loaded
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        }
+                        else if (n_head_kv > 0) { // regular attention layer: norm plus Q/K/V/O projections
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        } // n_head == 0 && n_head_kv == 0: neither branch runs, so no attention weights are created for this layer
+
+                        // optional bias tensors (requested for every layer, whichever attention branch ran above)
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { // rope factor tensors are optional; every layer after the first is flagged TENSOR_DUPLICATED
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
+                        else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        // optional MLP bias
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
+                } break;
             case LLM_ARCH_MINICPM3:
                 {
                     const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -8167,6 +8491,7 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_QWEN2:
+            case LLM_ARCH_QWEN2VL:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -8827,15 +9152,8 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
                     }
                 } break;
-            case LLM_ARCH_DEEPSEEK2:
+            case LLM_ARCH_DEEPSEEK:
                 {
-                    const bool is_lite = (hparams.n_layer == 27);
-
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
-                    const int64_t q_lora_rank  = hparams.n_lora_q;
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
 
                     const int64_t n_ff_exp        = hparams.n_ff_exp;
                     const int64_t n_expert_shared = hparams.n_expert_shared;
@@ -8850,23 +9168,11 @@ static bool llm_load_tensors(
                         auto & layer = model.layers[i];
 
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        if (!is_lite) {
-                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-                        }
-
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-                        if (!is_lite) {
-                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
-                        } else {
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        }
-
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
-                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
 
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
                         if (i < (int) hparams.n_layer_dense_lead) {
@@ -8895,18 +9201,86 @@ static bool llm_load_tensors(
                         }
                     }
                 } break;
-            case LLM_ARCH_BITNET:
+            case LLM_ARCH_DEEPSEEK2:
                 {
+                    const bool is_lite = (hparams.n_layer == 27);
+
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+                    const int64_t q_lora_rank  = hparams.n_lora_q;
+                    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
                     model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
-                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        if (!is_lite) {
+                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+                        }
+
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+                        if (!is_lite) {
+                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+                        } else {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        }
+
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_BITNET:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
+                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
 
                         layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
                         layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -9197,9 +9571,9 @@ static bool llm_load_tensors(
                 } break;
             case LLM_ARCH_CHAMELEON:
                 {
-                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                 // output
+                    // output
                     model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
@@ -9228,6 +9602,109 @@ static bool llm_load_tensors(
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_WAVTOKENIZER_DEC:
+                { // loads: token embedding -> input conv1d -> posnet layers -> tok_norm -> convnext layers -> output norm + projection
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0); // embedding width is n_embd_features, not n_embd
+
+                    model.conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); // leading 7 is presumably the kernel width -- confirm
+                    model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);
+
+                    // posnet
+                    {
+                        const int64_t n_embd = hparams.posnet.n_embd; // shadows the outer n_embd for the rest of this scope
+
+                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+                            auto & layer = model.layers[i].posnet; // posnet and convnext weights live in separate sub-structs of model.layers[i]
+
+                            // posnet layer layout, selected by index i in the switch below:
+                            //
+                            //  - resnet   (i = 0)
+                            //  - resnet   (i = 1)
+                            //  - attn     (i = 2)
+                            //  - resnet   (i = 3)
+                            //  - resnet   (i = 4)
+                            //  - norm     (i = 5)
+                            //
+                            switch (i) {
+                                case 0:
+                                case 1:
+                                case 3:
+                                case 4:
+                                    {
+                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 2:
+                                    {
+                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 5:
+                                    { // NOTE(review): the final norm is looked up under the POS_NET_ATTN_NORM name, not POS_NET_NORM -- confirm this matches the converter's tensor naming
+                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                default: GGML_ABORT("unknown posnet layer"); // reached for any i >= 6, i.e. when posnet.n_layer > 6
+                            };
+                        }
+                    }
+
+                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);
+
+                    // convnext
+                    {
+                        const int64_t n_embd = hparams.convnext.n_embd; // shadows the outer n_embd for the rest of this scope
+
+                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                            auto & layer = model.layers[i].convnext;
+
+                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
+                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
+
+                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
+                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
+
+                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
+                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
+
+                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
+                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
+
+                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+                        }
+
+                        // output norm (still sized with the convnext n_embd of this scope)
+                        model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    }
+
+                    model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); // n_embd is the outer value again here: projects convnext.n_embd -> n_embd
+                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -9447,6 +9924,7 @@ enum llm_ffn_gate_type {
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
+    LLM_NORM_GROUP, // group norm; llm_build_norm reads hparams.n_norm_groups and hparams.f_norm_group_eps for it
 };
 
 static struct ggml_tensor * llm_build_inp_embd(
@@ -9467,7 +9945,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-       lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -9588,8 +10066,14 @@ static struct ggml_tensor * llm_build_norm(
          const llm_build_cb & cb,
                         int   il) {
     switch (type) {
-        case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
-        case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM:       cur = ggml_norm      (ctx, cur, hparams.f_norm_eps);     break;
+        case LLM_NORM_RMS:   cur = ggml_rms_norm  (ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP: // group norm is applied on a 3-D view of the 2-D input
+            {
+                cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); // insert a size-1 dim: {ne0, ne1} -> {ne0, 1, ne1}
+                cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
+                cur = ggml_reshape_2d(ctx, cur, cur->ne[0],    cur->ne[2]); // drop the size-1 dim again: back to {ne0, ne1}
+            } break;
     }
 
     if (mw || mb) {
@@ -10928,6 +11412,167 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_deci() { // builds a graph whose layers use full attention, wo-only "linear" attention, or no attention, chosen per layer from n_head(il) / n_head_kv(il)
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // f_attention_scale == 0 selects the 1/sqrt(head size) fallback
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head    = hparams.n_head(il);
+
+            if (n_head == 0) {
+                // attention-free layer of Llama-3_1-Nemotron-51B: no attn_norm, the layer input is passed through
+                cur = inpL;
+            } else {
+                // norm
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            if (n_head > 0 && n_head_kv == 0) {
+                // "linear attention" of Llama-3_1-Nemotron-51B: the normed input is only multiplied by wo
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "wo", il);
+            } else if (n_head > 0) {
+                // self-attention
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them (the bq/bk/bv biases are added only when present)
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architecture (skipped when f_residual_scale is 0)
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            // modified to support attention-free layer of Llama-3_1-Nemotron-51B: the residual add only happens when n_head > 0
+            struct ggml_tensor * ffn_inp = cur;
+            if (n_head > 0) {
+                ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+            }
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) { // there is no else branch: a layer with ffn_gate_inp set gets no FFN here
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // For Granite architecture (skipped when f_residual_scale is 0)
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il); // NOTE(review): the post-residual tensor reuses the "ffn_out" callback name from the FFN output above
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture (logits are divided by f_logit_scale; skipped when it is 0)
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_baichuan() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -12556,6 +13201,124 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_qwen2vl() { // graph for LLM_ARCH_QWEN2VL: per layer, biased Q/K/V attention with ggml_rope_multi on Q and K, then a SILU FFN; returns gf
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+        const int64_t n_embd_head = hparams.n_embd_head_v; // asserted below to equal n_embd_head_k and n_rot
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions; allocated here with n_tokens * 4 I32 entries instead of via build_inp_pos()
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); // the 4 matches lctx.n_pos_per_token set by llama_build_graph for this arch
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        struct ggml_tensor * inp_pos = lctx.inp_pos;
+
+        // KQ_mask (mask for 1 head, it will be broadcast to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[4]; // local copy of the first 4 entries of hparams.rope_sections, handed to ggml_rope_multi
+        std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q, K and V (each with its bias added), then apply ggml_rope_multi to Q and K
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // bias is added unconditionally (no null check)
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); // KQ scale is 1/sqrt(n_embd_head)
+            }
+
+            if (il == n_layer - 1) {
+                // last layer only: keep just the rows selected by inp_out_ids, skipping output computation for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network: RMS norm, then up/gate/down FFN (LLM_FFN_SILU, LLM_FFN_PAR) with no bias tensors passed
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + FFN input
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head: multiply the normalized result by model.output
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -12839,7 +13602,13 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -12897,7 +13666,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -14900,29 +15669,161 @@ struct llm_build_context {
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
 
-                cur = llm_build_ffn(ctx0, lctx, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                        NULL,                      NULL,                        NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            }
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_arctic() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
 
-                cur = ggml_add(ctx0, cur, ffn_inp);
-                cur = lctx.cvec.apply_to(ctx0, cur, il);
-                cb(cur, "l_out", il);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
 
-                // input for next layer
-                inpL = cur;
-            }
+            // input for next layer
+            inpL = cur;
         }
 
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);
 
+        // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
@@ -14931,7 +15832,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_arctic() {
+    struct ggml_cgraph * build_deepseek() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -14951,7 +15852,7 @@ struct llm_build_context {
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -14963,33 +15864,48 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -15000,46 +15916,53 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
-            // feed-forward network
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            cur = llm_build_ffn(ctx0, lctx, cur,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
-            cb(ffn_out, "ffn_out", il);
-
-            // MoE
-            cur = llm_build_norm(ctx0, inpSA, hparams,
-                    model.layers[il].ffn_norm_exps, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm_exps", il);
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            false, hparams.expert_weights_scale,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
 
-            cur = llm_build_moe_ffn(ctx0, lctx, cur,
-                    model.layers[il].ffn_gate_inp,
-                    model.layers[il].ffn_up_exps,
-                    model.layers[il].ffn_gate_exps,
-                    model.layers[il].ffn_down_exps,
-                    n_expert, n_expert_used,
-                    LLM_FFN_SILU, true,
-                    false, 0.0,
-                    cb, il);
-            cb(cur, "ffn_moe_out", il);
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
 
-            cur = ggml_add(ctx0, cur, ffn_out);
-            cb(cur, "ffn_out", il);
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
 
+            cur = ggml_add(ctx0, cur, ffn_inp);
             cur = lctx.cvec.apply_to(ctx0, cur, il);
             cb(cur, "l_out", il);
 
@@ -15056,6 +15979,7 @@ struct llm_build_context {
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -15442,7 +16366,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_t5_encoder() {
+    struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15574,7 +16498,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_t5_decoder() {
+    struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16523,6 +17447,158 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_wavtokenizer_dec() { // graph for LLM_ARCH_WAVTOKENIZER_DEC: input conv1d -> posnet layers -> norm -> convnext layers -> norm -> output projection + bias; returns gf
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); // transpose (made contiguous) before the 1D convolution
+
+        cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+        cur = ggml_add(ctx0, cur, model.conv1d_b);
+
+        // posnet: layers 0, 1, 3, 4 are norm/conv residual blocks, layer 2 is an attention block, layer 5 is a lone group norm
+        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+            const auto & layer = model.layers[il].posnet;
+
+            inpL = cur; // saved for the residual add at the end of cases 0-4
+
+            switch (il) {
+                case 0:
+                case 1:
+                case 3:
+                case 4:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm1,
+                                layer.norm1_b,
+                                LLM_NORM_GROUP, cb, 0); // NOTE(review): 0 is passed as the layer index here rather than il — confirm intended
+
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); // x * sigmoid(x)
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv1_b);
+
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm2,
+                                layer.norm2_b,
+                                LLM_NORM_GROUP, cb, 0);
+
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); // x * sigmoid(x)
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv2_b);
+
+                        cur = ggml_add(ctx0, cur, inpL); // residual connection
+                    } break;
+                case 2:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.attn_norm,
+                                layer.attn_norm_b,
+                                LLM_NORM_GROUP, cb, 0);
+
+                        struct ggml_tensor * q;
+                        struct ggml_tensor * k;
+                        struct ggml_tensor * v;
+
+                        q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); // q, k and v are all produced by 1D convolutions over the normalized input
+                        k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+                        v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+                        q = ggml_add(ctx0, q, layer.attn_q_b);
+                        k = ggml_add(ctx0, k, layer.attn_k_b);
+                        v = ggml_add(ctx0, v, layer.attn_v_b);
+
+                        q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); // only q and k are transposed; v is used as-is
+                        k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
+
+                        struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+
+                        kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); // softmax with a null mask, scaled by 1/sqrt(posnet.n_embd)
+
+                        cur = ggml_mul_mat(ctx0, kq, v);
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.attn_o_b);
+
+                        cur = ggml_add(ctx0, cur, inpL); // residual connection
+                    } break;
+                case 5:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm,
+                                layer.norm_b,
+                                LLM_NORM_GROUP, cb, 0);
+                    } break;
+                default: GGML_ABORT("unknown posnet layer"); // any posnet layer index above 5 aborts
+            };
+        }
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // transpose, apply tok_norm, then transpose back
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        inpL = cur;
+
+        // convnext: per layer, depthwise conv + bias -> norm -> FFN (LLM_FFN_GELU, LLM_FFN_SEQ) -> multiply by gamma -> residual add
+        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+            const auto & layer = model.layers[il].convnext;
+
+            cur = inpL;
+
+            cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, layer.dw_b);
+
+            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // norm, FFN and gamma scaling run on the transposed tensor
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    layer.norm,
+                    layer.norm_b,
+                    LLM_NORM, cb, -1);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    layer.pw1, layer.pw1_b, NULL,
+                    NULL,      NULL,        NULL,
+                    layer.pw2, layer.pw2_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+
+            cur = ggml_mul(ctx0, cur, layer.gamma);
+
+            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // transpose back before the residual add
+
+            inpL = ggml_add(ctx0, cur, inpL); // residual connection; becomes the next layer's input
+        }
+
+        cur = inpL;
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+
+        // lm_head: multiply by model.output, then add the bias model.output_b
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_embd", -1); // final tensor is named "result_embd" (not "result_output")
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16611,6 +17687,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_llama();
             } break;
+        case LLM_ARCH_DECI:
+            {
+                result = llm.build_deci();
+            } break;
         case LLM_ARCH_BAICHUAN:
             {
                 result = llm.build_baichuan();
@@ -16657,6 +17737,11 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 result = llm.build_qwen2moe();
@@ -16745,6 +17830,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ -16760,14 +17849,14 @@ static struct ggml_cgraph * llama_build_graph(
         case LLM_ARCH_T5:
             {
                 if (lctx.is_encoding) {
-                    result = llm.build_t5_encoder();
+                    result = llm.build_t5_enc();
                 } else {
-                    result = llm.build_t5_decoder();
+                    result = llm.build_t5_dec();
                 }
             } break;
         case LLM_ARCH_T5ENCODER:
             {
-                result = llm.build_t5_encoder();
+                result = llm.build_t5_enc();
             } break;
         case LLM_ARCH_JAIS:
             {
@@ -16789,6 +17878,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                result = llm.build_wavtokenizer_dec();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -16875,35 +17968,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
 
     if (ubatch.pos && lctx.inp_pos) {
         const int64_t n_tokens = ubatch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;
 
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
 
-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
                 }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
             }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
         }
     }
 
@@ -17574,6 +18672,7 @@ static int llama_decode_internal(
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -19962,10 +21061,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
         case LLM_ARCH_RWKV6:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
@@ -19976,6 +21077,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
@@ -20009,6 +21111,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
@@ -20075,17 +21180,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return model->n_elements;
 }
 
-struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5:        return true;
@@ -21577,7 +22671,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
                 throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
             }
         } else if ((size_t) i >= ctx->output_ids.size()) {
-            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
         } else {
             j = ctx->output_ids[i];
         }
@@ -21794,6 +22888,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -21838,6 +22934,10 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_RWKV_WORLD;
     } else if (tmpl_contains("<|start_of_role|>")) {
         return LLM_CHAT_TEMPLATE_GRANITE;
+    } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+        return LLM_CHAT_TEMPLATE_GIGACHAT;
+    } else if (tmpl_contains("<|role_start|>")) {
+        return LLM_CHAT_TEMPLATE_MEGREZ;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -21944,6 +23044,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
@@ -22161,6 +23270,42 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
+    }  else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+        // Megrez template
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+        }
+
+        if (add_ass) {
+            ss << "<|role_start|>assistant<|role_end|>";
+        }
     } else {
         // template not supported
         return -1;
@@ -22180,15 +23325,15 @@ int32_t llama_chat_apply_template(
     std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
     if (tmpl == nullptr) {
         GGML_ASSERT(model != nullptr);
-        // load template from model
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res < 0) {
+
+        // load template from model, if available
+        const auto & it = model->gguf_kv.find("tokenizer.chat_template");
+        if (it != model->gguf_kv.end() && it->second.size() > 0) {
+            curr_tmpl = it->second;
+        }
+        else {
             // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
-        } else {
-            curr_tmpl = std::string(model_template.data(), model_template.size());
+            curr_tmpl = "chatml";  // see llama_chat_apply_template_internal
         }
     }
 
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 3d459263525..8ed6b1a51c2 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     throw std::invalid_argument("failed to convert utf8 to codepoint");
 }
 
-//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
 //    std::vector<uint16_t> result;
-//    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-//        result.emplace_back(cp);
+//    if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+//        result.emplace_back(cpt);
 //        return result;
 //    }
-//    if (0x10000 <= cp && cp <= 0x10ffff) {
-//        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-//        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+//    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
 //        return result;
 //    }
 //    throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 //    return result;
 //}
 
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
-    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+    std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
 
     assert (unicode_ranges_flags.begin()[0].first == 0);
     assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
         };
 
         size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
         };
 
         size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
 // interface
 //
 
-std::string unicode_cpt_to_utf8(uint32_t cp) {
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
     std::string result;
 
-    if (/* 0x00 <= cp && */ cp <= 0x7f) {
-        result.push_back(cp);
+    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+        result.push_back(cpt);
         return result;
     }
-    if (0x80 <= cp && cp <= 0x7ff) {
-        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x80 <= cpt && cpt <= 0x7ff) {
+        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
-    if (0x800 <= cp && cp <= 0xffff) {
-        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x800 <= cpt && cpt <= 0xffff) {
+        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
-    if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.push_back(0xf0 | ((cp >> 18) & 0x07));
-        result.push_back(0x80 | ((cp >> 12) & 0x3f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
 
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     return result;
 }
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     static const auto cpt_flags = unicode_cpt_flags_array();
-    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+    return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
 }
 
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     if (utf8.empty()) {
         return undef;  // undefined
     }
     size_t offset = 0;
-    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+    return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
 }
 
 std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
     return map.at(utf8);
 }
 
-uint32_t unicode_tolower(uint32_t cp) {
+uint32_t unicode_tolower(uint32_t cpt) {
     // binary search
-    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
         [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
             return pair.first < value;
         });
-    if (it != unicode_map_lowercase.end() && it->first == cp) {
+    if (it != unicode_map_lowercase.end() && it->first == cpt) {
         return it->second;
     }
-    return cp;  // Return the original code point if no lowercase mapping is found
+    return cpt;  // Return the original code point if no lowercase mapping is found
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", codepoint_flags::NUMBER },
-        { "\\p{L}", codepoint_flags::LETTER },
-        { "\\p{P}", codepoint_flags::PUNCTUATION },
+        { "\\p{N}", unicode_cpt_flags::NUMBER },
+        { "\\p{L}", unicode_cpt_flags::LETTER },
+        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
     };
 
     static const std::map<int, int> k_ucat_cpt = {
-        { codepoint_flags::NUMBER,        0xD1 },
-        { codepoint_flags::LETTER,        0xD2 },
-        { codepoint_flags::PUNCTUATION,   0xD3 },
+        { unicode_cpt_flags::NUMBER,      0xD1 },
+        { unicode_cpt_flags::LETTER,      0xD2 },
+        { unicode_cpt_flags::PUNCTUATION, 0xD3 },
     };
 
     static const std::map<int, std::string> k_ucat_map = {
-        { codepoint_flags::NUMBER,        "\x30-\x39" }, // 0-9
-        { codepoint_flags::LETTER,        "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { codepoint_flags::PUNCTUATION,   "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { unicode_cpt_flags::NUMBER,      "\x30-\x39" }, // 0-9
+        { unicode_cpt_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
     };
 
     // compute collapsed codepoints only if needed by at least one regex
     bool need_collapse = false;
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // search for unicode categories
         for (const auto & ucat : k_ucat_enum) {
             if (std::string::npos != regex_expr.find(ucat.first)) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 continue;
             }
 
-            const auto flags = unicode_cpt_flags(cpts[i]);
+            const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
 
             if (flags.is_whitespace) {
                 //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
     std::vector<size_t> bpe_offsets = { cpts.size() };
 
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // first, see if we have an efficient custom regex implementation
         auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
 
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
             // with the corresponding collapsed representation
             bool use_collapsed = false;
-            for (auto & ucat : k_ucat_enum) {
+            for (const auto & ucat : k_ucat_enum) {
                 if (std::string::npos != regex_expr.find(ucat.first)) {
                     use_collapsed = true;
                     break;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                 std::wstring wtext(cpts.begin(), cpts.end());
                 for (size_t i = 0; i < wtext.size(); ++i) {
-                    if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+                    if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
                         wtext[i] = 0x0B;
                     }
                 }
diff --git a/src/unicode.h b/src/unicode.h
index 008532a242a..c27098df7d4 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -4,9 +4,7 @@
 #include <string>
 #include <vector>
 
-// TODO: prefix all symbols with "llama_"
-
-struct codepoint_flags {
+struct unicode_cpt_flags {
     enum {
         UNDEFINED       = 0x0001,
         NUMBER          = 0x0002,  // regex: \p{N}
@@ -35,7 +33,7 @@ struct codepoint_flags {
     uint16_t is_nfd         : 1;
 
     // decode from uint16
-    inline codepoint_flags(const uint16_t flags=0) {
+    inline unicode_cpt_flags(const uint16_t flags = 0) {
         *reinterpret_cast<uint16_t*>(this) = flags;
     }
 
@@ -50,18 +48,19 @@ struct codepoint_flags {
 
 size_t unicode_len_utf8(char src);
 
-std::string unicode_cpt_to_utf8(uint32_t cp);
-uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+std::string unicode_cpt_to_utf8  (uint32_t cpt);
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp);
-codepoint_flags unicode_cpt_flags(const std::string & utf8);
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
 
 std::string unicode_byte_to_utf8(uint8_t byte);
-uint8_t unicode_utf8_to_byte(const std::string & utf8);
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
 
-uint32_t unicode_tolower(uint32_t cp);
+uint32_t unicode_tolower(uint32_t cpt);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 82373ff4e18..2b5e5fd4abe 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,39 +84,52 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
-# build test-tokenizer-1-bpe target once and add many tests
-add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-install(TARGETS test-tokenizer-1-bpe RUNTIME)
-
-# TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-
-# build test-tokenizer-1-spm target once and add many tests
-add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-install(TARGETS test-tokenizer-1-spm RUNTIME)
-
-llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-#llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-# llama_target_and_test(test-double-float.cpp) # SLOW
+
+if (NOT WIN32)
+    # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
+    llama_target_and_test(test-sampling.cpp)
+    llama_target_and_test(test-grammar-parser.cpp)
+    llama_target_and_test(test-grammar-integration.cpp)
+    llama_target_and_test(test-llama-grammar.cpp)
+    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+    if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+    endif()
+
+
+    # build test-tokenizer-1-bpe target once and add many tests
+    add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+    target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+    install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+    # TODO: disabled due to slowness
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+
+    # build test-tokenizer-1-spm target once and add many tests
+    add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+    target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+    install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+    llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+    #llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+    # llama_target_and_test(test-double-float.cpp) # SLOW
+endif()
+
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 
-llama_target_and_test(test-grammar-parser.cpp)
-llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-llama-grammar.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-gguf.cpp)
 llama_target_and_test(test-backend-ops.cpp)
 
 llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
@@ -130,11 +143,6 @@ if (NOT GGML_BACKEND_DL)
     llama_target_and_test(test-rope.cpp)
 endif()
 
-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
-if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
-endif()
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9dd41260a1f..ccdd3fb57a5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2201,7 +2201,15 @@ struct test_rope : public test_case {
             ggml_set_name(a, "a");
         }
 
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+        const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+        ggml_tensor * pos;
+        if (is_mrope || is_vision) {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
+        } else {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        }
         ggml_set_name(pos, "pos");
 
         ggml_tensor * freq = nullptr;
@@ -2210,7 +2218,20 @@ struct test_rope : public test_case {
             ggml_set_name(freq, "freq");
         }
 
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+        ggml_tensor * out;
+        if (is_mrope) {
+            if (is_vision) {
+                GGML_ASSERT(n_dims/4 > 0);
+                int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only uses the first two sections, for the image (x, y) coordinates
+                out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            } else {
+                GGML_ASSERT(n_dims/3 > 0);
+                int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
+                out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            }
+        } else {
+            out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+        }
         ggml_set_name(out, "out");
 
         return out;
@@ -2220,11 +2241,12 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne_a[2]);
-                for (int i = 0; i < ne_a[2]; i++) {
+                const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+                std::vector<int> data(num_pos_ids);
+                for (int i = 0; i < num_pos_ids; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
             } else {
                 if (t->ne[0] == n_dims/2) {
                     // frequency factors in the range [0.9f, 1.1f]
@@ -3527,8 +3549,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
-           test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-           test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
@@ -3813,6 +3835,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                     test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
                                 }
 
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
+                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,vision-rope (qwen2vl ViT)
+                                }
+
                                 test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
                             }
                         }
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index aa140b5696f..51bfb155b47 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -75,6 +75,10 @@ int main(void) {
         "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS][\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" 
}}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n        {{- \"[TOOL_CALLS][\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- message[\"content\"] + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- 
raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
         // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
         "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
+        // ai-sage/GigaChat-20B-A3B-instruct
+        "{% if messages[0]['role'] == 'system' -%}\n    {%- set loop_messages = messages[1:] -%}\n    {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n    {%- set loop_messages = messages -%}\n    {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n    \n    {%- if loop.index0 == 0 -%}\n        {{ system_message -}}\n    {%- endif -%}\n    {%- if message['role'] == 'user' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n        {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3]  + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if message['role'] == 'assistant' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if loop.last and add_generation_prompt -%}\n        {{ 'assistant' + additional_special_tokens[0] -}}\n    {%- endif -%}\n{%- endfor %}",
+        // Infinigence/Megrez-3B-Instruct
+        u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct，将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -129,6 +133,10 @@ int main(void) {
         "[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]Another question[/INST]",
         // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
         "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST]    I am an assistant   </s>[INST] Another question[/INST]",
+        // ai-sage/GigaChat-20B-A3B-instruct
+        "<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>   I am an assistant   <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
+        // Infinigence/Megrez-3B-Instruct
+        "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|>   I am an assistant   <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
@@ -190,6 +198,7 @@ int main(void) {
     assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
     assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
     assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+    assert(fmt_sys("gigachat") == "<s>You are a helpful assistant<|message_sep|>");
 
 
     // test llama_chat_format_single for user message
@@ -214,6 +223,7 @@ int main(void) {
     assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
     assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
+    assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>");
 
     printf("Test chat templates: OK\n");
 
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
new file mode 100644
index 00000000000..1bb5fb47c43
--- /dev/null
+++ b/tests/test-gguf.cpp
@@ -0,0 +1,1303 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "../ggml/src/ggml-impl.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+#include <string>
+#include <vector>
+
+constexpr int offset_has_kv      = 1000; // file types >= this value are written with a KV section
+constexpr int offset_has_tensors = 2000; // file types >= this value are additionally written with tensor metadata
+constexpr int offset_has_data    = 3000; // file types >= this value are additionally written with tensor data
+
+enum handcrafted_file_type { // KV_*/TENSORS_*/DATA_* values are shifted by the matching offset_has_* constant
+    HANDCRAFTED_HEADER_BAD_MAGIC          =  10, // magic bytes replaced by "FUGG"
+    HANDCRAFTED_HEADER_BAD_VERSION_1      =  20, // version field set to 1
+    HANDCRAFTED_HEADER_BAD_VERSION_FUTURE =  30, // version field set to GGUF_VERSION + 1
+    HANDCRAFTED_HEADER_BAD_N_TENSORS      =  40, // tensor count written as uint64_t(-1)
+    HANDCRAFTED_HEADER_BAD_N_KV           =  50, // KV count written as uint64_t(-1)
+    HANDCRAFTED_HEADER_EMPTY              = 800, // header only: zero KV pairs and zero tensors
+
+    HANDCRAFTED_KV_BAD_KEY_SIZE           =  10 + offset_has_kv, // key length written as uint64_t(-1)
+    HANDCRAFTED_KV_BAD_TYPE               =  20 + offset_has_kv, // value type id written as -1
+    HANDCRAFTED_KV_BAD_VALUE_SIZE         =  30 + offset_has_kv, // scalar values written with 0-2 extra bytes
+    HANDCRAFTED_KV_DUPLICATE_KEY          =  40 + offset_has_kv, // consecutive pairs share the same key name
+    HANDCRAFTED_KV_SUCCESS                = 800 + offset_has_kv, // random KV pairs without a deliberate defect
+
+    HANDCRAFTED_TENSORS_BAD_NAME_SIZE     =  10 + offset_has_tensors, // tensor names of at least GGML_MAX_NAME characters
+    HANDCRAFTED_TENSORS_BAD_N_DIMS        =  20 + offset_has_tensors, // dimension count written as GGML_MAX_DIMS + 1
+    HANDCRAFTED_TENSORS_BAD_SHAPE         =  30 + offset_has_tensors, // every dimension written as -1
+    HANDCRAFTED_TENSORS_NE_TOO_BIG        =  40 + offset_has_tensors, // every dimension written as 4*INT32_MAX
+    HANDCRAFTED_TENSORS_BAD_TYPE          =  50 + offset_has_tensors, // tensor type id written as -1
+    HANDCRAFTED_TENSORS_BAD_OFFSET        =  60 + offset_has_tensors, // tensor offset written as uint64_t(-1)
+    HANDCRAFTED_TENSORS_DUPLICATE_NAME    =  70 + offset_has_tensors, // all tensors are named "my_tensor"
+    HANDCRAFTED_TENSORS_BAD_ALIGNMENT     =  80 + offset_has_tensors, // offsets padded with GGUF_DEFAULT_ALIGNMENT - 1, no alignment key
+    HANDCRAFTED_TENSORS_SUCCESS           = 800 + offset_has_tensors, // random tensor metadata without a deliberate defect
+    HANDCRAFTED_TENSORS_CUSTOM_ALIGN      = 810 + offset_has_tensors, // adds a general.alignment key of GGUF_DEFAULT_ALIGNMENT + 1
+
+    HANDCRAFTED_DATA_NOT_ENOUGH_DATA      =  10 + offset_has_data, // data section is one byte shorter than the tensors require
+    HANDCRAFTED_DATA_BAD_ALIGNMENT        =  20 + offset_has_data, // like TENSORS_BAD_ALIGNMENT, with a data section
+    HANDCRAFTED_DATA_SUCCESS              = 800 + offset_has_data, // KV pairs, tensor metadata and data without a deliberate defect
+    HANDCRAFTED_DATA_CUSTOM_ALIGN         = 810 + offset_has_data, // like TENSORS_CUSTOM_ALIGN, with a data section
+};
+
+static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { // enumerator name without the HANDCRAFTED_ prefix
+    switch (hft) {
+        case HANDCRAFTED_HEADER_BAD_MAGIC:          return "HEADER_BAD_MAGIC";
+        case HANDCRAFTED_HEADER_BAD_VERSION_1:      return "HEADER_BAD_VERSION_1";
+        case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
+        case HANDCRAFTED_HEADER_BAD_N_KV:           return "HEADER_BAD_N_KV";
+        case HANDCRAFTED_HEADER_BAD_N_TENSORS:      return "HEADER_BAD_N_TENSORS";
+        case HANDCRAFTED_HEADER_EMPTY:              return "HEADER_EMPTY";
+
+        case HANDCRAFTED_KV_BAD_KEY_SIZE:           return "KV_BAD_KEY_SIZE";
+        case HANDCRAFTED_KV_BAD_TYPE:               return "KV_BAD_TYPE";
+        case HANDCRAFTED_KV_BAD_VALUE_SIZE:         return "KV_BAD_VALUE_SIZE";
+        case HANDCRAFTED_KV_DUPLICATE_KEY:          return "KV_DUPLICATE_KEY";
+        case HANDCRAFTED_KV_SUCCESS:                return "KV_SUCCESS"; // label matches the enumerator, like every other case
+
+        case HANDCRAFTED_TENSORS_BAD_NAME_SIZE:     return "TENSORS_BAD_NAME_SIZE";
+        case HANDCRAFTED_TENSORS_BAD_N_DIMS:        return "TENSORS_BAD_N_DIMS";
+        case HANDCRAFTED_TENSORS_BAD_SHAPE:         return "TENSORS_BAD_SHAPE";
+        case HANDCRAFTED_TENSORS_NE_TOO_BIG:        return "TENSORS_NE_TOO_BIG";
+        case HANDCRAFTED_TENSORS_BAD_TYPE:          return "TENSORS_BAD_TYPE";
+        case HANDCRAFTED_TENSORS_BAD_OFFSET:        return "TENSORS_BAD_OFFSET";
+        case HANDCRAFTED_TENSORS_DUPLICATE_NAME:    return "TENSORS_DUPLICATE_NAME";
+        case HANDCRAFTED_TENSORS_BAD_ALIGNMENT:     return "TENSORS_BAD_ALIGNMENT";
+        case HANDCRAFTED_TENSORS_SUCCESS:           return "TENSORS_SUCCESS";
+        case HANDCRAFTED_TENSORS_CUSTOM_ALIGN:      return "TENSORS_CUSTOM_ALIGN";
+
+        case HANDCRAFTED_DATA_NOT_ENOUGH_DATA:      return "DATA_NOT_ENOUGH_DATA";
+        case HANDCRAFTED_DATA_BAD_ALIGNMENT:        return "DATA_BAD_ALIGNMENT";
+        case HANDCRAFTED_DATA_SUCCESS:              return "DATA_SUCCESS";
+        case HANDCRAFTED_DATA_CUSTOM_ALIGN:         return "DATA_CUSTOM_ALIGN";
+    }
+    GGML_ABORT("fatal error"); // reached only for a value that is not one of the enumerators above
+}
+
+static bool expect_context_not_null(const enum handcrafted_file_type hft) {
+    // Pick the threshold of the section hft belongs to; values below it are the BAD_* variants of that section.
+    handcrafted_file_type first_valid = HANDCRAFTED_DATA_SUCCESS;
+    if (hft < offset_has_kv) {
+        first_valid = HANDCRAFTED_HEADER_EMPTY;
+    } else if (hft < offset_has_tensors) {
+        first_valid = HANDCRAFTED_KV_SUCCESS;
+    } else if (hft < offset_has_data) {
+        first_valid = HANDCRAFTED_TENSORS_SUCCESS;
+    }
+    return hft >= first_valid;
+}
+
+typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;
+
+static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) { // draws up to 100 random (type, shape) pairs; advances rng
+    std::vector<tensor_config_t> tensor_configs;
+    tensor_configs.reserve(100);
+
+    for (int i = 0; i < 100; ++i) {
+        const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+        if (ggml_type_size(type) == 0) {
+            continue; // type ids reporting a size of 0 are skipped, so fewer than 100 configs may be returned
+        }
+
+        std::array<int64_t, GGML_MAX_DIMS> shape = {1, 1, 1, 1};
+        shape[0] = (1 + rng() % 10) * ggml_blck_size(type); // first dimension is a whole multiple of the block size
+        const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+        for (int j = 1; j < n_dims; ++j) { // j rather than i, so the outer loop index is not shadowed
+            shape[j] = 1 + rng() % 10;
+        }
+
+        tensor_configs.push_back(std::make_pair(type, shape));
+    }
+
+    return tensor_configs;
+}
+
+std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+    // NOTE(review): rng is received by value, so draws made here do not advance the caller's generator -- confirm intended.
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> result;
+    result.reserve(100);
+
+    for (int attempt = 0; attempt < 100; ++attempt) {
+        const gguf_type outer = gguf_type(rng() % GGUF_TYPE_COUNT);
+        gguf_type inner = gguf_type(-1); // placeholder element type for non-array values
+
+        if (outer == GGUF_TYPE_ARRAY) {
+            inner = gguf_type(rng() % GGUF_TYPE_COUNT);
+            if (inner == GGUF_TYPE_ARRAY) {
+                continue; // arrays of arrays are not generated; this attempt yields no entry
+            }
+        }
+
+        result.emplace_back(outer, inner);
+    }
+
+    std::shuffle(result.begin(), result.end(), rng);
+    return result;
+}
+
+static void helper_write(const void * data, const size_t nbytes, FILE * file) { // writes nbytes raw bytes from data to file
+    GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes); // asserts that fwrite reported all nbytes as written
+}
+
+static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
+    // Writes a GGUF file selected by hft (valid, or with one deliberate defect) to a temporary stream and rewinds it.
+    FILE * file = tmpfile();
+    if (!file) {
+        return nullptr; // tmpfile() failed: leave the decision to the caller instead of writing to a null stream
+    }
+
+    std::mt19937 rng(seed);
+
+    if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
+        const char bad_magic[4] = {'F', 'U', 'G', 'G'};
+        helper_write(bad_magic, sizeof(bad_magic), file);
+    } else {
+        helper_write(GGUF_MAGIC, 4, file);
+    }
+
+    {
+        uint32_t version = GGUF_VERSION;
+        if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
+            version = 1;
+        } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
+            version = GGUF_VERSION + 1;
+        }
+        helper_write(&version, sizeof(version), file);
+    }
+
+    std::vector<tensor_config_t> tensor_configs;
+    if (hft >= offset_has_tensors) {
+        tensor_configs = get_tensor_configs(rng);
+    }
+
+    {
+        const uint64_t n_tensors = hft == HANDCRAFTED_HEADER_BAD_N_TENSORS ? uint64_t(-1) : uint64_t(tensor_configs.size());
+        helper_write(&n_tensors, sizeof(n_tensors), file);
+    }
+
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+    if (hft >= offset_has_kv) {
+        kv_types = get_kv_types(rng);
+    }
+    {
+        uint64_t n_kv = kv_types.size();
+        if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+            n_kv += 1; // accounts for the general.alignment pair written after the random pairs
+        } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
+            n_kv = -1;
+        }
+        helper_write(&n_kv, sizeof(n_kv), file);
+    }
+
+    if (hft < offset_has_kv) {
+        for (int i = 0; i < extra_bytes; ++i) {
+            const char tmp = 0;
+            helper_write(&tmp, sizeof(tmp), file);
+        }
+        rewind(file);
+        return file;
+    }
+
+    for (int i = 0; i < int(kv_types.size()); ++i) {
+        const enum gguf_type type     = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first);
+        const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second);
+
+        const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i)); // i/2 repeats each key once
+
+        if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
+            const uint64_t n = -1;
+            helper_write(&n, sizeof(n), file);
+        } else {
+            const uint64_t n = key.length();
+            helper_write(&n, sizeof(n), file);
+        }
+        helper_write(key.data(), key.length(), file);
+
+        {
+            const int32_t type32 = int32_t(type);
+            helper_write(&type32, sizeof(type32), file);
+        }
+
+        uint32_t data[16];
+        for (int j = 0; j < 16; ++j) {
+            data[j] = rng();
+            if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+                data[j] |= 0x01010101; // avoid random null-termination of string
+            }
+        }
+
+        if (type == GGUF_TYPE_STRING) {
+            const uint64_t n = rng() % sizeof(data);
+            helper_write(&n,   sizeof(n), file);
+            helper_write(data,        n,  file);
+            continue;
+        }
+
+        if (type == GGUF_TYPE_ARRAY) {
+            {
+                const int32_t type32 = int32_t(type_arr);
+                helper_write(&type32, sizeof(type32), file);
+            }
+            if (type_arr == GGUF_TYPE_STRING) {
+                const uint64_t nstr = rng() % (16 + 1);
+                helper_write(&nstr, sizeof(nstr), file);
+                for (uint64_t istr = 0; istr < nstr; ++istr) {
+                    const uint64_t n = rng() % (sizeof(uint32_t) + 1); // each string takes at most the 4 bytes of data[istr]
+                    helper_write(&n,          sizeof(n), file);
+                    helper_write(&data[istr],        n,  file);
+                }
+                continue;
+            }
+            const size_t type_size = gguf_type_size(type_arr);
+            const uint64_t n = (rng() % sizeof(data)) / type_size;
+            helper_write(&n,    sizeof(n),   file);
+            helper_write(&data, n*type_size, file);
+            continue;
+        }
+
+        size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type);
+        if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) {
+            type_size += rng() % 3;
+        }
+        helper_write(data, type_size, file);
+    }
+
+    if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+        const std::string key = "general.alignment";
+        {
+            const uint64_t n = key.length();
+            helper_write(&n, sizeof(n), file);
+        }
+        helper_write(key.data(), key.length(), file);
+
+        const int32_t type = gguf_type(GGUF_TYPE_UINT32);
+        helper_write(&type, sizeof(type), file);
+
+        const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1;
+        helper_write(&alignment, sizeof(alignment), file);
+    }
+
+    if (hft < offset_has_tensors) {
+        for (int i = 0; i < extra_bytes; ++i) {
+            const char tmp = 0;
+            helper_write(&tmp, sizeof(tmp), file);
+        }
+        rewind(file);
+        return file;
+    }
+
+    uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+    if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) {
+        alignment -= 1;
+    } else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+        alignment += 1;
+    }
+
+    uint64_t offset = 0;
+    for (int i = 0; i < int(tensor_configs.size()); ++i) {
+        const ggml_type                          type  = tensor_configs[i].first;
+        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+        std::string name = "my_tensor";
+        if (hft != HANDCRAFTED_TENSORS_DUPLICATE_NAME) {
+            name += "_" + std::to_string(i);
+        }
+        if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) {
+            name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+            GGML_ASSERT(name.length() >= GGML_MAX_NAME);
+        }
+        {
+            const uint64_t n = name.length();
+            helper_write(&n, sizeof(n), file);
+        }
+        helper_write(name.data(), name.length(), file);
+
+        uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1;
+        for (int d = GGML_MAX_DIMS-1; d >= 1; --d) { // highest dimension that differs from 1 determines n_dims
+            if (shape[d] != 1) {
+                n_dims = d + 1;
+                break;
+            }
+        }
+        if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
+            const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
+            helper_write(&n_dims_bad, sizeof(n_dims_bad), file);
+        } else {
+            helper_write(&n_dims,     sizeof(n_dims),     file);
+        }
+
+        if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
+            for (uint32_t j = 0; j < n_dims; ++j) {
+                const int64_t bad_dim = -1;
+                helper_write(&bad_dim, sizeof(bad_dim), file);
+            }
+        } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
+            for (uint32_t j = 0; j < n_dims; ++j) {
+                const int64_t big_dim = 4*int64_t(INT32_MAX);
+                helper_write(&big_dim, sizeof(big_dim), file);
+            }
+        } else {
+            helper_write(shape.data(), n_dims*sizeof(int64_t), file);
+        }
+
+        {
+            const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type);
+            helper_write(&type32, sizeof(type32), file);
+        }
+
+        if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
+            const uint64_t bad_offset = -1;
+            helper_write(&bad_offset, sizeof(bad_offset), file);
+        } else {
+            helper_write(&offset, sizeof(offset), file);
+        }
+
+        int64_t ne = shape[0];
+        for (uint32_t d = 1; d < n_dims; ++d) {
+            ne *= shape[d];
+        }
+        offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+    }
+
+    const uint32_t alignment_overshoot = ftell(file) % alignment;
+    if (alignment_overshoot != 0) {
+        for (size_t i = alignment_overshoot; i < alignment; ++i) { // zero-pad up to the next multiple of alignment
+            const char pad = 0;
+            helper_write(&pad, sizeof(pad), file);
+        }
+    }
+
+    if (hft >= offset_has_data) {
+        rng.seed(seed + 1);
+        uint64_t nbytes = offset;
+        if (hft == HANDCRAFTED_DATA_NOT_ENOUGH_DATA) {
+            nbytes -= 1;
+        }
+        for (uint64_t i = 0; i < nbytes; ++i) {
+            const uint8_t random_byte = i % 256; // the written value is the byte position modulo 256
+            helper_write(&random_byte, sizeof(random_byte), file);
+        }
+    }
+
+    for (int i = 0; i < extra_bytes; ++i) {
+        const char tmp = 0;
+        helper_write(&tmp, sizeof(tmp), file);
+    }
+    rewind(file);
+    return file;
+}
+
+static bool handcrafted_check_header(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_kv, const bool has_tensors, const bool alignment_defined) {
+    // Checks the version and the tensor/KV counts of gguf_ctx against the values regenerated from seed.
+    // A null gguf_ctx counts as a failure.
+    if (gguf_ctx == nullptr) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    // Regenerate the configurations, tensors first and KV types second.
+    size_t n_tensors_expected = 0;
+    if (has_tensors) {
+        n_tensors_expected = get_tensor_configs(rng).size();
+    }
+
+    size_t n_kv_expected = 0;
+    if (has_kv) {
+        n_kv_expected = get_kv_types(rng).size();
+    }
+    if (alignment_defined) {
+        n_kv_expected += 1; // one additional KV pair is expected when an alignment is defined
+    }
+
+    // All three comparisons are evaluated; any mismatch fails the check.
+    const bool version_ok   = gguf_get_version(gguf_ctx)   == GGUF_VERSION;
+    const bool n_tensors_ok = gguf_get_n_tensors(gguf_ctx) == int(n_tensors_expected);
+    const bool n_kv_ok      = gguf_get_n_kv(gguf_ctx)      == int(n_kv_expected);
+
+    return version_ok && n_tensors_ok && n_kv_ok;
+}
+
+static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_tensors, const bool alignment_defined) {
+    // Regenerates the KV data from seed in the same draw order as get_handcrafted_file and compares it with gguf_ctx.
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    if (has_tensors) {
+        get_tensor_configs(rng); // result unused: only consumes the draws the file writer made for the tensor configs
+    }
+
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types = get_kv_types(rng);
+
+    bool ok = true;
+
+    for (int i = 0; i < int(kv_types.size()); ++i) {
+        const enum gguf_type type     = gguf_type(kv_types[i].first);
+        const enum gguf_type type_arr = gguf_type(kv_types[i].second);
+
+        const std::string key = "my_key_" + std::to_string(i);
+
+        uint32_t data[16];
+        for (int j = 0; j < 16; ++j) {
+            data[j] = rng();
+            if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+                data[j] |= 0x01010101; // avoid random null-termination of string
+            }
+        }
+
+        const char * data8 = reinterpret_cast<const char *>(data);
+        const int id = gguf_find_key(gguf_ctx, key.c_str());
+        if (id < 0) {
+            ok = false; // key not found: fail the check rather than pass a negative id to the getters below
+            continue;
+        }
+
+        if (type == GGUF_TYPE_STRING) {
+            const char * str = gguf_get_val_str(gguf_ctx, id);
+            const uint64_t n = strlen(str);
+            const uint64_t n_expected = rng() % sizeof(data);
+            if (n != n_expected || !std::equal(str, str + n, data8)) { // content is compared only when the lengths agree
+                ok = false;
+            }
+            continue;
+        }
+
+        if (type == GGUF_TYPE_ARRAY) {
+            const size_t type_size = gguf_type_size(type_arr);
+            const uint64_t arr_n = gguf_get_arr_n(gguf_ctx, id);
+
+            if (type_arr == GGUF_TYPE_STRING) {
+                const uint64_t nstr_expected = rng() % (16 + 1);
+                if (arr_n != nstr_expected) {
+                    ok = false;
+                    continue;
+                }
+                for (uint64_t istr = 0; istr < nstr_expected; ++istr) {
+                    const char * str = gguf_get_arr_str(gguf_ctx, id, istr);
+                    const uint64_t n = strlen(str);
+                    const uint64_t n_expected = rng() % (sizeof(uint32_t) + 1);
+
+                    if (n != n_expected) {
+                        ok = false;
+                        continue;
+                    }
+                    const char * str_expected = reinterpret_cast<const char *>(&data[istr]);
+                    if (strncmp(str, str_expected, n) != 0) {
+                        ok = false;
+                        continue;
+                    }
+                }
+                continue;
+            }
+
+            const uint64_t arr_n_expected = (rng() % sizeof(data)) / type_size;
+            if (arr_n != arr_n_expected) {
+                ok = false;
+                continue;
+            }
+
+            const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id));
+            if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
+                ok = false;
+            }
+            continue;
+        }
+
+        const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id));
+        if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
+            ok = false;
+        }
+    }
+
+    const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT;
+    if (gguf_get_alignment(gguf_ctx) != expected_alignment) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsigned int seed) {
+    // Compares name, type and data offset of every tensor in gguf_ctx with the values regenerated from seed.
+    // All tensors are inspected; the result is false if any of them differs or is missing.
+    if (gguf_ctx == nullptr) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    const std::vector<tensor_config_t> configs = get_tensor_configs(rng);
+
+    // Mirrors the call sequence of the file writer. NOTE(review): get_kv_types takes rng by value,
+    // so this call leaves rng untouched here -- confirm whether that is intended.
+    get_kv_types(rng);
+
+    // Use the alignment stored under general.alignment if that key exists, the default otherwise.
+    uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+    const int alignment_key = gguf_find_key(gguf_ctx, "general.alignment");
+    if (alignment_key >= 0) {
+        alignment = gguf_get_val_u32(gguf_ctx, alignment_key);
+    }
+
+    bool all_match = true;
+    uint64_t offset_expected = 0;
+
+    for (int i = 0; i < int(configs.size()); ++i) {
+        const tensor_config_t & config = configs[i];
+        const std::string name_expected = "my_tensor_" + std::to_string(i);
+
+        const int tensor_id = gguf_find_tensor(gguf_ctx, name_expected.c_str());
+        if (tensor_id < 0) {
+            // A missing tensor fails the check and does not advance the expected offset.
+            all_match = false;
+            continue;
+        }
+
+        const bool name_ok   = std::string(gguf_get_tensor_name(gguf_ctx, tensor_id)) == name_expected;
+        const bool type_ok   = gguf_get_tensor_type(gguf_ctx, tensor_id) == config.first;
+        const bool offset_ok = gguf_get_tensor_offset(gguf_ctx, tensor_id) == offset_expected;
+        if (!(name_ok && type_ok && offset_ok)) {
+            all_match = false;
+        }
+
+        // The element count is the product over all GGML_MAX_DIMS entries of the shape.
+        int64_t n_elements = 1;
+        for (const int64_t dim : config.second) {
+            n_elements *= dim;
+        }
+        offset_expected += GGML_PAD(ggml_row_size(config.first, n_elements), alignment);
+    }
+
+    return all_match;
+}
+
+static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const unsigned int seed, FILE * file) {
+    // Reads every tensor's bytes back from file and compares them with the (position % 256) pattern of the writer.
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+    std::vector<tensor_config_t> tensor_configs = get_tensor_configs(rng);
+    bool ok = true;
+
+    for (int i = 0; i < int(tensor_configs.size()); ++i) {
+        const ggml_type                          type  = tensor_configs[i].first;
+        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+        int64_t ne = shape[0];
+        for (size_t j = 1; j < GGML_MAX_DIMS; ++j) {
+            ne *= shape[j];
+        }
+        const size_t size = ggml_row_size(type, ne);
+
+        const std::string name = "my_tensor_" + std::to_string(i);
+        const int id = gguf_find_tensor(gguf_ctx, name.c_str());
+        if (id < 0) {
+            ok = false; // tensor not found: fail the check rather than query the offset of a negative id
+            continue;
+        }
+        const size_t offset = gguf_get_tensor_offset(gguf_ctx, id);
+
+        std::vector<uint8_t> data(size);
+        GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
+        GGML_ASSERT(fread(data.data(), 1, size, file) == size);
+        for (size_t j = 0; j < size; ++j) {
+            if (data[j] != uint8_t((j + offset) % 256)) { // expected value: position within the data section modulo 256
+                ok = false;
+            }
+        }
+    }
+
+    return ok;
+}
+
+static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
+    int npass = 0;
+    int ntest = 0;
+    // Loads every handcrafted file type listed below and runs the applicable checks; returns (npass, ntest).
+    const std::vector<handcrafted_file_type> hfts = {
+        HANDCRAFTED_HEADER_BAD_MAGIC,
+        HANDCRAFTED_HEADER_BAD_VERSION_1,
+        // HANDCRAFTED_FILE_TYPE_BAD_VERSION_FUTURE, // FIXME
+        HANDCRAFTED_HEADER_BAD_N_KV,
+        HANDCRAFTED_HEADER_BAD_N_TENSORS,
+        HANDCRAFTED_HEADER_EMPTY,
+
+        HANDCRAFTED_KV_BAD_KEY_SIZE,
+        HANDCRAFTED_KV_BAD_TYPE,
+        // HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit
+        // HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME
+        HANDCRAFTED_KV_SUCCESS,
+
+        HANDCRAFTED_TENSORS_BAD_NAME_SIZE,
+        HANDCRAFTED_TENSORS_BAD_N_DIMS,
+        HANDCRAFTED_TENSORS_BAD_SHAPE,
+        HANDCRAFTED_TENSORS_NE_TOO_BIG,
+        HANDCRAFTED_TENSORS_BAD_TYPE,
+        // HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME
+        HANDCRAFTED_TENSORS_DUPLICATE_NAME,
+        // HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME
+        HANDCRAFTED_TENSORS_SUCCESS,
+        HANDCRAFTED_TENSORS_CUSTOM_ALIGN,
+
+        HANDCRAFTED_DATA_NOT_ENOUGH_DATA,
+        // HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME
+        HANDCRAFTED_DATA_SUCCESS,
+        HANDCRAFTED_DATA_CUSTOM_ALIGN,
+    };
+
+    for (enum handcrafted_file_type hft : hfts) {
+        printf("%s: handcrafted_file_type=%s\n", __func__, handcrafted_file_type_name(hft).c_str());
+        FILE * file = get_handcrafted_file(seed, hft);
+
+#ifdef _WIN32
+        if (!file) {
+            printf("%s: failed to create tmpfile(), needs elevated privileges on Windows\n", __func__);
+            printf("%s: skipping tests\n", __func__);
+            continue;
+        }
+#else
+        GGML_ASSERT(file);
+#endif // _WIN32
+        // A ggml_context is only requested when hft >= offset_has_data.
+        struct ggml_context * ctx = nullptr;
+        struct gguf_init_params gguf_params = {
+            /*no_alloc =*/ false,
+            /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
+        };
+        struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+
+        if (expect_context_not_null(hft)) {
+            printf("%s:   - context_not_null: ", __func__);
+        } else {
+            printf("%s:   - context_null: ", __func__);
+        }
+        if (bool(gguf_ctx) == expect_context_not_null(hft)) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+
+        if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME
+            printf("%s:   - no_dangling_ggml_context_pointer: ", __func__);
+            if (ctx) {
+                printf("\033[1;31mFAIL\033[0m\n");
+            } else {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            }
+            ntest++;
+        }
+
+        if (false && expect_context_not_null(hft)) { // FIXME
+            FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1);
+            struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params);
+
+            printf("%s:   - context_null_with_extra_bytes: ", __func__);
+            if (gguf_ctx_eb) {
+                printf("\033[1;31mFAIL\033[0m\n");
+            } else {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            }
+            ntest++;
+
+            gguf_free(gguf_ctx_eb);
+            fclose(file_eb);
+        }
+
+        const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN;
+        // The content checks below only run when a valid context is expected for this file type.
+        if (expect_context_not_null(hft)) {
+            printf("%s:   - check_header: ", __func__);
+            if (handcrafted_check_header(gguf_ctx, seed, hft >= offset_has_kv, hft >= offset_has_tensors, alignment_defined)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_kv) {
+            printf("%s:   - check_kv: ", __func__);
+            if (handcrafted_check_kv(gguf_ctx, seed, hft >= offset_has_tensors, alignment_defined)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_tensors) {
+            printf("%s:   - check_tensors: ", __func__);
+            if (handcrafted_check_tensors(gguf_ctx, seed)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_data) {
+            printf("%s:   - check_tensor_data: ", __func__);
+            if (handcrafted_check_tensor_data(gguf_ctx, seed, file)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (gguf_ctx) {
+            ggml_free(ctx);
+            gguf_free(gguf_ctx);
+        }
+        fclose(file);
+        printf("\n");
+    }
+
+    return std::make_pair(npass, ntest);
+}
+
+struct random_gguf_context_result { // aggregate returned by get_random_gguf_context
+    struct gguf_context * gguf_ctx; // GGUF context holding the random KV pairs and the added tensors
+    struct ggml_context * ctx;      // ggml context in which the tensors were created
+    ggml_backend_buffer_t buffer;   // backend buffer returned when allocating the tensors of ctx
+};
+
+static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t backend, const unsigned int seed) {
+    std::mt19937 rng(seed);
+    // Builds a GGUF context with random KV pairs and random tensors allocated on `backend`, driven by `seed`.
+    struct gguf_context * gguf_ctx = gguf_init_empty();
+
+    for (int i = 0; i < 256; ++i) {
+        const std::string key = "my_key_" + std::to_string(rng() % 1024);
+        const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+        if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) {
+            continue; // FIXME memory leak
+        }
+
+        switch (type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (gguf_ctx, key.c_str(), rng() % (1 <<  7));             break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (gguf_ctx, key.c_str(), rng() % (1 <<  7) - (1 <<  6)); break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (gguf_ctx, key.c_str(), rng() % (1 << 15));             break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (gguf_ctx, key.c_str(), rng() % (1 << 15) - (1 << 14)); break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (gguf_ctx, key.c_str(), rng());                         break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (gguf_ctx, key.c_str(), rng()             - (1 << 30)); break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024      - 512);       break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(gguf_ctx, key.c_str(), rng() % 2 == 0);                break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (gguf_ctx, key.c_str(), std::to_string(rng()).c_str()); break;
+            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (gguf_ctx, key.c_str(), rng());                         break;
+            case GGUF_TYPE_INT64:   gguf_set_val_i64 (gguf_ctx, key.c_str(), rng()             - (1 << 30)); break;
+            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (gguf_ctx, key.c_str(), rng() % 1024      - 512);       break;
+            case GGUF_TYPE_ARRAY: {
+                const enum gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+                const uint64_t ne = rng() % 1024;
+
+                switch (type_arr) {
+                    case GGUF_TYPE_UINT8:
+                    case GGUF_TYPE_INT8:
+                    case GGUF_TYPE_UINT16:
+                    case GGUF_TYPE_INT16:
+                    case GGUF_TYPE_UINT32:
+                    case GGUF_TYPE_INT32:
+                    case GGUF_TYPE_FLOAT32:
+                    case GGUF_TYPE_BOOL:
+                    case GGUF_TYPE_UINT64:
+                    case GGUF_TYPE_INT64:
+                    case GGUF_TYPE_FLOAT64: {
+                        const size_t nbytes = ne*gguf_type_size(type_arr);
+                        std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+                        for (size_t j = 0; j < random_data.size(); ++j) {
+                            random_data[j] = rng();
+                        }
+                        gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne);
+                    } break;
+                    case GGUF_TYPE_STRING: {
+                        std::vector<std::string>  data_cpp(ne);
+                        std::vector<const char *> data_c(ne);
+                        for (size_t j = 0; j < data_cpp.size(); ++j) {
+                            data_cpp[j] = std::to_string(rng());
+                            data_c[j]   = data_cpp[j].c_str();
+                        }
+                        gguf_set_arr_str(gguf_ctx, key.c_str(), data_c.data(), ne);
+                    } break;
+                    case GGUF_TYPE_ARRAY: {
+                        break; // not supported
+                    }
+                    case GGUF_TYPE_COUNT:
+                    default: {
+                        GGML_ABORT("fatal error");
+                    } break;
+                }
+            } break;
+            case GGUF_TYPE_COUNT:
+            default: {
+                GGML_ABORT("fatal error");
+            } break;
+        }
+    }
+    // The ggml context is created with no_alloc = true and room for 256 tensor headers.
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ 256*ggml_tensor_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < 256; ++i) {
+        const std::string name = "my_tensor_" + std::to_string(i);
+        const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+        const size_t type_size = ggml_type_size(type);
+
+        if (type_size == 0) {
+            continue;
+        }
+
+        const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+        int64_t ne[GGML_MAX_DIMS];
+        ne[0] = (1 + rng() % 10) * ggml_blck_size(type);
+        for (int j = 1; j < n_dims; ++j) {
+            ne[j] = 1 + rng() % 10;
+        }
+
+        struct ggml_tensor * tensor = ggml_new_tensor(ctx, type, n_dims, ne);
+        ggml_set_name(tensor, name.c_str());
+    }
+    // Allocate the tensors on the backend, fill each with random data and add it to the GGUF context.
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+        const size_t nbytes = ggml_nbytes(t);
+        std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+        for (size_t j = 0; j < random_data.size(); ++j) {
+            random_data[j] = rng();
+        }
+        ggml_backend_tensor_set(t, random_data.data(), 0, nbytes);
+
+        gguf_add_tensor(gguf_ctx, t);
+    }
+
+    return {gguf_ctx, ctx, buf};
+}
+
+static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other) {
+    // True if every KV pair of `ctx` exists in `other` with equal type and value; a mismatch does not stop the scan.
+    bool all_match = true;
+
+    for (int i = 0, n = gguf_get_n_kv(ctx); i < n; ++i) {
+        const char * key = gguf_get_key(ctx, i);
+        const int    j   = gguf_find_key(other, key);
+
+        if (j < 0) {
+            all_match = false;
+            continue;
+        }
+
+        const gguf_type kv_type = gguf_get_kv_type(ctx, i);
+        if (gguf_get_kv_type(other, j) != kv_type) {
+            all_match = false;
+            continue;
+        }
+
+        if (kv_type == GGUF_TYPE_STRING) {
+            const std::string lhs = gguf_get_val_str(ctx,   i);
+            const std::string rhs = gguf_get_val_str(other, j);
+            if (lhs != rhs) {
+                all_match = false;
+            }
+            continue;
+        }
+
+        if (kv_type != GGUF_TYPE_ARRAY) {
+            const char * lhs = reinterpret_cast<const char *>(gguf_get_val_data(ctx,   i));
+            const char * rhs = reinterpret_cast<const char *>(gguf_get_val_data(other, j));
+            if (!std::equal(lhs, lhs + gguf_type_size(kv_type), rhs)) {
+                all_match = false;
+            }
+            continue;
+        }
+
+        // Arrays: length and element type must agree before the contents are compared.
+        const int n_elems = gguf_get_arr_n(ctx, i);
+        if (gguf_get_arr_n(other, j) != n_elems) {
+            all_match = false;
+            continue;
+        }
+        const gguf_type elem_type = gguf_get_arr_type(ctx, i);
+        if (gguf_get_arr_type(other, j) != elem_type) {
+            all_match = false;
+            continue;
+        }
+
+        if (elem_type == GGUF_TYPE_STRING) {
+            for (int k = 0; k < n_elems; ++k) {
+                const std::string lhs = gguf_get_arr_str(ctx,   i, k);
+                const std::string rhs = gguf_get_arr_str(other, j, k);
+                if (lhs != rhs) {
+                    all_match = false;
+                }
+            }
+            continue;
+        }
+
+        const char * lhs = reinterpret_cast<const char *>(gguf_get_arr_data(ctx,   i));
+        const char * rhs = reinterpret_cast<const char *>(gguf_get_arr_data(other, j));
+        if (!std::equal(lhs, lhs + n_elems*gguf_type_size(elem_type), rhs)) {
+            all_match = false;
+        }
+    }
+
+    return all_match;
+}
+
+static bool all_tensors_in_other(const gguf_context * ctx, const gguf_context * other) {
+    bool ok = true;
+    // Every tensor of `ctx` must be found in `other` at the same index, with the same type and offset.
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    for (int id = 0; id < n_tensors; ++id) {
+        const std::string name = gguf_get_tensor_name(ctx, id);
+
+        const int idx_other = gguf_find_tensor(other, name.c_str());
+        if (id != idx_other) {
+            ok = false;
+            if (idx_other < 0) {
+                continue;
+            }
+        }
+        // Query `other` at the index the tensor was actually found at; `id` is not guaranteed to be valid there.
+        const ggml_type type = gguf_get_tensor_type(ctx, id);
+        if (type != gguf_get_tensor_type(other, idx_other)) {
+            ok = false;
+        }
+
+        const size_t offset = gguf_get_tensor_offset(ctx, id);
+        if (offset != gguf_get_tensor_offset(other, idx_other)) {
+            ok = false;
+        }
+    }
+
+    return ok;
+}
+
+static bool same_tensor_data(const struct ggml_context * orig, const struct ggml_context * read) {
+    bool ok = true;
+    // Walks both contexts in lockstep; the tensor count, byte sizes and data of `orig` and `read` must match.
+    struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
+    struct ggml_tensor * t_read = ggml_get_first_tensor(read);
+    while (t_orig) {
+        if (!t_read) {
+            ok = false;
+            break;
+        }
+
+        const size_t nbytes = ggml_nbytes(t_orig);
+        if (ggml_nbytes(t_read) != nbytes) {
+            ok = false;
+            break;
+        }
+        std::vector<char> data_orig(nbytes);
+        ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes);
+        if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast<const char *>(t_read->data))) {
+            ok = false;
+        }
+
+        t_orig = ggml_get_next_tensor(orig, t_orig);
+        t_read = ggml_get_next_tensor(read, t_read); // each tensor is advanced within its own context
+    }
+    if (t_read) {
+        ok = false;
+    }
+
+    return ok; // report the accumulated result instead of unconditionally succeeding
+}
+
+static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+    if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { // before tmpfile(): nothing to release here
+        return std::make_pair(0, 0); // FIXME
+    }
+
+    FILE * file = tmpfile();
+#ifdef _WIN32
+    if (!file) {
+        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows\n", __func__);
+        printf("%s: skipping tests\n", __func__);
+        return std::make_pair(0, 0);
+    }
+#else
+    GGML_ASSERT(file);
+#endif // _WIN32
+
+    ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+    printf("%s: device=%s, backend=%s, only_meta=%s\n",
+        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+    // Writes a random GGUF context to a temporary file, reads it back and compares; returns (npass, ntest).
+    int npass = 0;
+    int ntest = 0;
+
+    struct gguf_context * gguf_ctx_0;
+    struct ggml_context * ctx_0;
+    ggml_backend_buffer_t bbuf;
+    {
+        struct random_gguf_context_result result = get_random_gguf_context(backend, seed);
+        gguf_ctx_0 = result.gguf_ctx;
+        ctx_0      = result.ctx;
+        bbuf       = result.buffer;
+    }
+    // Serialize the original context into the file, then rewind so that it can be read back from the start.
+    struct gguf_buf gbuf = gguf_buf_init(16 * 1024);
+    gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta);
+    helper_write(gbuf.data, gbuf.offset, file);
+    rewind(file);
+
+    struct ggml_context * ctx_1 = nullptr;
+    struct gguf_init_params gguf_params = {
+        /*no_alloc =*/ false,
+        /*ctx      =*/ only_meta ? nullptr : &ctx_1,
+    };
+    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+
+    printf("%s: same_version: ", __func__);
+    if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: same_n_kv: ", __func__);
+    if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: same_n_tensors: ", __func__);
+    if (gguf_get_n_tensors(gguf_ctx_0) == gguf_get_n_tensors(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_orig_kv_in_read: ", __func__);
+    if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_read_kv_in_orig: ", __func__);
+    if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_orig_tensors_in_read: ", __func__);
+    if (all_tensors_in_other(gguf_ctx_0, gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_read_tensors_in_orig: ", __func__);
+    if (all_tensors_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    if (!only_meta) {
+        printf("%s: same_tensor_data: ", __func__);
+        if (same_tensor_data(ctx_0, ctx_1)) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    ggml_backend_buffer_free(bbuf);
+    ggml_free(ctx_0);
+    ggml_free(ctx_1);
+    gguf_free(gguf_ctx_0);
+    gguf_free(gguf_ctx_1);
+    gguf_buf_free(gbuf);
+    ggml_backend_free(backend);
+    GGML_ASSERT(fclose(file) == 0);
+
+    printf("\n");
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_gguf_set_kv(ggml_backend_dev_t dev, const unsigned int seed) {
+    // Checks gguf_set_kv using two random GGUF contexts (A, B) and an initially empty one.
+    // Returns {number of passed checks, number of checks run}.
+    ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+    printf("%s: device=%s, backend=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend));
+
+    int n_passed = 0;
+    int n_total  = 0;
+
+    const struct random_gguf_context_result rnd_a = get_random_gguf_context(backend, seed);
+    const struct random_gguf_context_result rnd_b = get_random_gguf_context(backend, seed + 1);
+
+    struct gguf_context * const kv_a  = rnd_a.gguf_ctx;
+    struct ggml_context * const ctx_a = rnd_a.ctx;
+    ggml_backend_buffer_t const buf_a = rnd_a.buffer;
+
+    struct gguf_context * const kv_b  = rnd_b.gguf_ctx;
+    struct ggml_context * const ctx_b = rnd_b.ctx;
+    ggml_backend_buffer_t const buf_b = rnd_b.buffer;
+
+    struct gguf_context * const kv_empty = gguf_init_empty();
+
+    // First pass: apply gguf_set_kv with A as the second argument to both B and the empty context.
+    gguf_set_kv(kv_b,     kv_a);
+    gguf_set_kv(kv_empty, kv_a);
+
+    // The formerly empty context is expected to hold as many KV pairs as A.
+    printf("%s: same_n_kv: ", __func__);
+    if (gguf_get_n_kv(kv_a) != gguf_get_n_kv(kv_empty)) {
+        printf("\033[1;31mFAIL\033[0m\n");
+    } else {
+        printf("\033[1;32mOK\033[0m\n");
+        n_passed++;
+    }
+    n_total++;
+
+    // Every KV pair of A is expected in B ...
+    printf("%s: all_kv_0_in_1: ", __func__);
+    if (!all_kv_in_other(kv_a, kv_b)) {
+        printf("\033[1;31mFAIL\033[0m\n");
+    } else {
+        printf("\033[1;32mOK\033[0m\n");
+        n_passed++;
+    }
+    n_total++;
+
+    // ... and in the formerly empty context.
+    printf("%s: all_kv_0_in_2: ", __func__);
+    if (!all_kv_in_other(kv_a, kv_empty)) {
+        printf("\033[1;31mFAIL\033[0m\n");
+    } else {
+        printf("\033[1;32mOK\033[0m\n");
+        n_passed++;
+    }
+    n_total++;
+
+    // Second pass in the opposite direction: A and B are then expected to agree.
+    gguf_set_kv(kv_a, kv_b);
+
+    printf("%s: same_n_kv_after_double_copy: ", __func__);
+    if (gguf_get_n_kv(kv_a) != gguf_get_n_kv(kv_b)) {
+        printf("\033[1;31mFAIL\033[0m\n");
+    } else {
+        printf("\033[1;32mOK\033[0m\n");
+        n_passed++;
+    }
+    n_total++;
+
+    // Every KV pair of B is expected in A after the second pass.
+    printf("%s: all_kv_1_in_0_after_double_copy: ", __func__);
+    if (!all_kv_in_other(kv_b, kv_a)) {
+        printf("\033[1;31mFAIL\033[0m\n");
+    } else {
+        printf("\033[1;32mOK\033[0m\n");
+        n_passed++;
+    }
+    n_total++;
+
+    // Release buffers, ggml contexts, GGUF contexts and finally the backend.
+    ggml_backend_buffer_free(buf_a);
+    ggml_backend_buffer_free(buf_b);
+    ggml_free(ctx_a);
+    ggml_free(ctx_b);
+    gguf_free(kv_a);
+    gguf_free(kv_b);
+    gguf_free(kv_empty);
+    ggml_backend_free(backend);
+
+    printf("\n");
+    return std::make_pair(n_passed, n_total);
+}
+
+static void print_usage() { // prints the command line synopsis to stdout
+    printf("usage: test-gguf [seed]\n");
+    printf("  if no seed is specified then a random seed is used\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc > 2) {
+        print_usage();
+        return 1;
+    }
+    // Seed: random unless given as argv[1]; parsed as unsigned so that values above INT_MAX are accepted.
+    std::random_device rd;
+    const unsigned int seed = argc < 2 ? rd() : static_cast<unsigned int>(std::stoul(argv[1]));
+
+    // Initialize ggml backends early so the prints aren't interleaved with the test results:
+    ggml_backend_dev_count();
+    fprintf(stderr, "\n");
+    // npass/ntest accumulate the (passed, total) pairs returned by each test group.
+    int npass = 0;
+    int ntest = 0;
+    {
+        std::pair<int, int> result = test_handcrafted_file(seed);
+        npass += result.first;
+        ntest += result.second;
+    }
+
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+        for (bool only_meta : {true, false}) {
+            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+            npass += result.first;
+            ntest += result.second;
+        }
+
+        {
+            std::pair<int, int> result = test_gguf_set_kv(dev, seed);
+            npass += result.first;
+            ntest += result.second;
+        }
+    }
+
+    printf("%d/%d tests passed\n", npass, ntest);
+    if (npass != ntest) {
+        printf("\033[1;31mFAIL\033[0m\n");
+        return 1;
+    }
+    printf("\033[1;32mOK\033[0m\n");
+    return 0;
+}
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 5cc0cdb0475..e1bdbb9250f 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -32,13 +32,10 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
 static bool match_string(const std::string & input, llama_grammar * grammar) {
     const auto cpts = unicode_cpts_from_utf8(input);
 
-    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
 
     for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
-
-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        llama_grammar_accept(grammar, cpt);
 
         if (stacks_cur.empty()) {
             // no stacks means that the grammar failed to match at this point
@@ -63,7 +60,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
     auto * grammar = build_grammar(grammar_str);
 
     // Save the original grammar stacks so that we can reset after every new string we want to test
-    const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar);
+    const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
 
     llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
 
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index 6f1374ca8ed..e2129206be1 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -113,12 +113,10 @@ int main()
         }
     }
 
-    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
 
-    grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    if (grammar == nullptr)
-    {
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr) {
         throw std::runtime_error("Failed to initialize llama_grammar");
     }
 
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 4656b30f09c..322b8bb99ec 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;
 
     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;
 
         const int64_t n_rot = 128;
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;
 
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
-
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        int mode = -1;
 
-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
 
-        //  33,  34,  35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x,  p2, n_rot, mode);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+            //  33,  34,  35, ..., 105
+            r2 = ggml_rope(ctx0, x,  p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+
+            // [[100, 101, 102, ..., 172],
+            // [101, 102, 103, ..., 173],
+            // [102, 103, 104, ..., 174]]
+            r0 = ggml_rope_multi(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]]
+            r1 = ggml_rope_multi(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+            //  [[33,  34,  35, ..., 105]
+            //  [34,  35,  36, ..., 106]
+            //  [35,  36,  37, ..., 107]]
+            r2 = ggml_rope_multi(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
 
         ggml_cgraph * gf = ggml_new_graph(ctx0);
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index e5c9e75e418..c0dcb4848b4 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -145,7 +145,7 @@ static void test_penalties(
     sampler_tester tester(probs, probs_expected);
 
     const size_t n_vocab = probs.size();
-    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
+    auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
         llama_sampler_accept(sampler, last_tokens[i]);