diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml
index 5d76da3d79ac5..446f799fac34a 100644
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -5,6 +5,10 @@ inputs:
     description: 'CURL version'
     required: false
     default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
 outputs:
   curl_path:
     description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
       shell: powershell
       env:
         CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
       run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
        mkdir $env:RUNNER_TEMP/libcurl
        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index 1c38d7e11da6b..dbd31e589be3e 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -140,3 +140,94 @@ jobs:
             -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
         cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libvulkan-dev:ppc64el \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 5f54909dcbd8c..02ff188855d6a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -238,14 +238,19 @@ jobs:
       matrix:
         include:
           - build: 'cpu-x64'
+            arch: 'x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
          #- build: 'openblas-x64'
+          #  arch: 'x64'
          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
             defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'cpu-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
           - build: 'opencl-adreno-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
@@ -312,6 +317,8 @@
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
 
       - name: Build
         id: cmake_build
@@ -339,7 +346,7 @@
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts
diff --git a/README.md b/README.md
index 0401723ffcf87..5472f7abdeb21 100644
--- a/README.md
+++ b/README.md
@@ -572,4 +572,11 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```
 
-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index db5d4094a17b8..a7ff3ac16c446 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -121,8 +121,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index f4b3d9cf5929c..85dbbcd5d7f99 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -415,6 +415,13 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
@@ -1362,6 +1369,13 @@ @implementation GGMLMetalClass
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,        flash_attn_ext_q8_0_h256,        has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,      flash_attn_ext_vec_f16_h64,      has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,     flash_attn_ext_vec_bf16_h64,     has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,     flash_attn_ext_vec_q4_0_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,     flash_attn_ext_vec_q4_1_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,     flash_attn_ext_vec_q5_0_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,     flash_attn_ext_vec_q5_1_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,     flash_attn_ext_vec_q8_0_h64,     has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,      flash_attn_ext_vec_f16_h96,      has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,     flash_attn_ext_vec_bf16_h96,     has_simdgroup_reduction && use_bfloat);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,     flash_attn_ext_vec_q4_0_h96,     has_simdgroup_reduction);
@@ -4358,7 +4372,7 @@ static bool ggml_metal_encode_node(
                // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
                //       for now avoiding mainly to keep the number of templates/kernels a bit lower
                //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
+                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
                    switch (src1->type) {
                        case GGML_TYPE_F16:
                            {
@@ -4539,6 +4553,24 @@ static bool ggml_metal_encode_node(
                use_vec_kernel = true;
 
                switch (ne00) {
+                    case 64:
+                        {
+                            switch (src1->type) {
+                                case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline;  break;
+                                case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
+                                case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
+                                case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
+                                case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
+                                case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
+                                case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
+                                default:
+                                    {
+                                        GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                        GGML_LOG_ERROR("add template specialization for this type\n");
+                                        GGML_ABORT("add template specialization for this type");
+                                    }
+                            }
+                        } break;
                    case 96:
                        {
                            switch (src1->type) {
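Note on the dispatch change above: the head-size list in `ggml_metal_encode_node` now includes 64, so small-batch (e.g. single-token decode) attention on models with 64-wide heads can use the new `_VEC_H64` kernels instead of falling back to the matrix kernels. A rough C++ restatement of the heuristic, for clarity only — the helper name is invented here, not part of the patch:

```cpp
// Illustrative restatement of the condition changed above, not code from the
// patch. ne01 is the number of query rows; ne00 is the attention head size.
static bool use_simdgroup_mm_kernel(int ne01, int ne00) {
    // head sizes that have a dedicated vector kernel (64 is newly added)
    const bool has_vec_kernel = (ne00 % 128 == 0) ||
        ne00 == 64 || ne00 == 96 || ne00 == 192 || ne00 == 576;
    // large batches keep the simdgroup matrix kernels; small batches with a
    // supported head size take the vector path (use_vec_kernel = true)
    return ne01 >= 20 || !has_vec_kernel;
}
```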
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 122ae59737196..e94b6cd756441 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4124,6 +4124,16 @@ kernel void kernel_flash_attn_ext_vec(
 
 typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t;
 
+template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
+
 template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
 #if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec;
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index dcc6ec809a7d1..becaac4048a7f 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -655,7 +655,6 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -688,7 +687,6 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -722,7 +720,6 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -754,7 +751,6 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -786,7 +782,6 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -818,7 +813,6 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -850,7 +844,6 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -883,7 +876,6 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -917,7 +909,6 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -949,7 +940,6 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -981,7 +971,6 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1013,7 +1002,6 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1045,7 +1033,6 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1078,7 +1065,6 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1110,7 +1096,6 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1142,7 +1127,6 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1174,7 +1158,6 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1206,7 +1189,6 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1241,7 +1223,6 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1273,7 +1254,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1315,7 +1295,6 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1350,7 +1329,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
@@ -1388,7 +1366,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
         }
         default:
             GGML_ABORT("GGML tensor type not supported!\n");
-            break;
     }
 }
diff --git a/src/llama.cpp b/src/llama.cpp
index 9fdddf7b071f8..2f06e0f8ce12d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -140,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
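With the guard added above, model loading now fails fast (instead of crashing later) when no ggml backend has been registered — which matters for `GGML_BACKEND_DL` builds where backends are shared libraries loaded at runtime. A minimal sketch of the expected call order; the model path is a placeholder:

```cpp
#include "ggml-backend.h"
#include "llama.h"

int main() {
    // load all available backends first; with dynamically loaded backends,
    // skipping this now triggers the new "no backends are loaded" error
    ggml_backend_load_all();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```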
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 01eec46e842ac..02fb00339ec8d 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index f32f3c86aad2c..129d013ac75f7 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2251,6 +2251,14 @@ struct server_context {
             slot.has_next_token = true;
         }
 
+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop           = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
         // check the limits
         if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stop = STOP_TYPE_LIMIT;
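The `n_past + 1 >= n_ctx` comparison above can look off-by-one at first glance. A hedged restatement of the arithmetic — the helper name is invented for illustration, not part of the patch:

```cpp
// Illustrative only. The token currently being sampled will occupy KV cache
// position n_past, so after it is accepted the cache holds n_past + 1 tokens;
// with context shifting disabled, that count must stay within n_ctx.
static bool would_run_out_of_context(bool ctx_shift_enabled, int n_past, int n_ctx) {
    return !ctx_shift_enabled && (n_past + 1 >= n_ctx);
}
```

The streaming test added below exercises exactly this path: with context shifting disabled, the stream must terminate with `finish_reason == "length"` rather than erroring out mid-generation.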
diff --git a/tools/server/tests/unit/test_ctx_shift.py b/tools/server/tests/unit/test_ctx_shift.py
index be93a6d31f410..2431ac70882d7 100644
--- a/tools/server/tests/unit/test_ctx_shift.py
+++ b/tools/server/tests/unit/test_ctx_shift.py
@@ -65,3 +65,21 @@ def test_ctx_shift_disabled_long_prompt():
     assert res.status_code != 200
     assert "error" in res.body
     assert "exceeds the available context size" in res.body["error"]["message"]
+
+def test_ctx_shift_disabled_stream():
+    global server
+    server.disable_ctx_shift = True
+    server.start()
+    res = server.make_stream_request("POST", "/v1/completions", data={
+        "n_predict": 256,
+        "prompt": "Once",
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        choice = data["choices"][0]
+        if choice["finish_reason"] == "length":
+            assert len(content) > 0
+        else:
+            assert choice["finish_reason"] is None
+            content += choice["text"]
diff --git a/tools/server/webui/src/App.tsx b/tools/server/webui/src/App.tsx
index 3b00a8f909ad6..1b673bbaa1cce 100644
--- a/tools/server/webui/src/App.tsx
+++ b/tools/server/webui/src/App.tsx
@@ -28,13 +28,13 @@ function AppLayout() {
   return (
     <>
-
-
+ {
+
 {items.map((item, i) => (
 clickToShow && setShow(i)}
+ tabIndex={0}
+ aria-description={
+   clickToShow ? `Click to show: ${item.name}` : undefined
+ }
+ role={clickToShow ? 'button' : 'menuitem'}
 >
 {removeItem && (
 {showingItem.type === 'imageFile' ? (
- {showingItem.name}
+ {`Preview
 ) : (
diff --git a/tools/server/webui/src/components/ChatMessage.tsx b/tools/server/webui/src/components/ChatMessage.tsx
index 08eb423526b53..ee59de450d1ff 100644
--- a/tools/server/webui/src/components/ChatMessage.tsx
+++ b/tools/server/webui/src/components/ChatMessage.tsx
@@ -83,13 +83,20 @@ export default function ChatMessage({
 
   if (!viewingChat) return null;
 
+  const isUser = msg.role === 'user';
+
   return (
-
+
 {msg.extra && msg.extra.length > 0 && (
@@ -99,7 +106,7 @@ export default function ChatMessage({
 {/* textarea for editing message */}
@@ -142,7 +149,7 @@ export default function ChatMessage({
 ) : (
   <>
     {/* render message as markdown */}
-
+
 {thought && (
 {siblingLeafNodeIds && siblingLeafNodeIds.length > 1 && (
-
+
@@ -215,6 +227,7 @@ export default function ChatMessage({
 'opacity-20': !nextSibling,
 })}
 onClick={() => nextSibling && onChangeSibling(nextSibling)}
+ aria-label="Next message version"
 >
@@ -223,7 +236,7 @@ export default function ChatMessage({
 {/* user message */}
 {msg.role === 'user' && (
 setEditingContent(msg.content)}
 disabled={msg.content === null}
 tooltipsContent="Edit message"
@@ -236,7 +249,7 @@ export default function ChatMessage({
 <>
 {!isPending && (
 {
   if (msg.content !== null) {
     onRegenerateMessage(msg as Message);
@@ -250,10 +263,7 @@ export default function ChatMessage({
 )}
 )}
-
+
 )}
@@ -271,6 +281,8 @@ function ThoughtProcess({
 }) {
 return (
-
+
diff --git a/tools/server/webui/src/components/ChatScreen.tsx b/tools/server/webui/src/components/ChatScreen.tsx
index 661fe14905a8f..09c601ef2366a 100644
--- a/tools/server/webui/src/components/ChatScreen.tsx
+++ b/tools/server/webui/src/components/ChatScreen.tsx
@@ -279,7 +279,11 @@ export default function ChatScreen() {
 function ServerInfo() {
   const { serverProps } = useAppContext();
   return (
-
+
 Server Info
 
@@ -311,6 +315,8 @@ function ChatInput({
 return (
 
diff --git a/tools/server/webui/src/components/Header.tsx b/tools/server/webui/src/components/Header.tsx
index 45775ff7a6258..ccddc21ddab73 100644
--- a/tools/server/webui/src/components/Header.tsx
+++ b/tools/server/webui/src/components/Header.tsx
@@ -38,8 +38,12 @@ export default function Header() {
 {/* action buttons (top right) */}
-
-
diff --git a/tools/server/webui/src/components/SettingDialog.tsx b/tools/server/webui/src/components/SettingDialog.tsx
index 0240a17f407a4..e4684be7e007c 100644
--- a/tools/server/webui/src/components/SettingDialog.tsx
+++ b/tools/server/webui/src/components/SettingDialog.tsx
@@ -335,14 +335,22 @@ export default function SettingDialog({
 };
 return (
-
+
 
 Settings
 
 {/* Left panel, showing sections - Desktop version */}
-
+
 {SETTING_SECTIONS.map((section, idx) => (
 {section.title}
-
+ ))}
 {/* Left panel, showing sections - Mobile version */}
-
+ {/* This menu is skipped on a11y, otherwise it duplicates the desktop version */}
+
 {SETTING_SECTIONS[sectionIdx].title}
diff --git a/tools/server/webui/src/components/Sidebar.tsx b/tools/server/webui/src/components/Sidebar.tsx
index 8e79e00b8dd4f..8cac52f4c6ddf 100644
--- a/tools/server/webui/src/components/Sidebar.tsx
+++ b/tools/server/webui/src/components/Sidebar.tsx
@@ -50,44 +50,72 @@ export default function Sidebar() {
 id="toggle-drawer"
 type="checkbox"
 className="drawer-toggle"
+ aria-label="Toggle sidebar"
 defaultChecked
 />
-
+
+
+ Skip to main content
+
+
-
 
 Conversations
 
+
 
+ Conversations
+
 
 {/* close sidebar button */}
-
 {/* new conversation button */}
-
 navigate('/')}
+ aria-label="New conversation"
 >
 New conversation
-
+
 {/* list of conversations */}
 {groupedConv.map((group, i) => (
+
 {/* group name (by date) */}
 {group.title ? (
 // we use btn class here to make sure that the padding/margin are aligned with the other items
-
+
 {group.title}
 ) : (
@@ -184,20 +212,23 @@ function ConversationItem({
 }) {
 return (
-
 {conv.name}
-
+
 {/* dropdown menu */}