From a70a8a69c2ad3de8d5525ad2b185b098011be2e8 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Fri, 23 May 2025 13:09:38 -0700
Subject: [PATCH 1/9] ci : add winget package updater (#13732)

---
 .github/workflows/winget.yml | 42 ++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 .github/workflows/winget.yml

diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
new file mode 100644
index 0000000000000..0a1737ecdfd3c
--- /dev/null
+++ b/.github/workflows/winget.yml
@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --dry-run \
+            ggml.llamacpp

From b775345d788ac16260e7eef49e11fe57ee5677f7 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Fri, 23 May 2025 13:14:00 -0700
Subject: [PATCH 2/9] ci : enable winget package updates (#13734)

---
 .github/workflows/winget.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
index 0a1737ecdfd3c..5c286155951e5 100644
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -38,5 +38,5 @@ jobs:
           komac update --version ${{ env.VERSION }} \
             --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
             --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
-            --dry-run \
+            --submit \
             ggml.llamacpp

From ffd0eae60b7642e942c8245b89642527e36099e6 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Sat, 24 May 2025 11:46:19 +0200
Subject: [PATCH 3/9] CUDA: fix race condition in FA vector kernels (#13742)

---
 ggml/src/ggml-cuda/fattn-vec-f16.cuh | 1 +
 ggml/src/ggml-cuda/fattn-vec-f32.cuh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index 798a59b277861..35e649cb3c81b 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -212,6 +212,7 @@ static __global__ void flash_attn_vec_ext_f16(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP
diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
index 49c592ea59224..9539679177969 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -217,6 +217,7 @@ static __global__ void flash_attn_vec_ext_f32(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP

From c3a2624339187e89c4f65fd72a5fe7103968b5ad Mon Sep 17 00:00:00 2001
From: Sigbjørn Skjæret
Date: Sat, 24 May 2025 12:29:09 +0200
Subject: [PATCH 4/9] vocab : fix ugm tokenizer precision (#13743)

---
 src/llama-vocab.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9389ca805a584..d5a036a8c4413 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
         // at the beginning tokenization score is zero
         tokenization_results[0] = { vocab.token_unk(), 0, 0 };
 
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -1007,7 +1007,7 @@ struct llm_tokenizer_ugm_session {
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
-        float score_sum;
+        double score_sum;
     };
 
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {

From 4c32832c59e97d96c19349fdc92763e8949fda83 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Sat, 24 May 2025 13:06:47 +0200
Subject: [PATCH 5/9] ggml : add ggml_gelu_erf() CUDA kernel (#13719)

* ggml : add ggml_gelu_erf() CUDA kernel

* missing semicolon
---
 ggml/src/ggml-cuda/ggml-cuda.cu |  4 ++++
 ggml/src/ggml-cuda/unary.cu     | 10 ++++++++++
 ggml/src/ggml-cuda/unary.cuh    |  2 ++
 3 files changed, 16 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 02dc8c12dbd8c..c442a64924303 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2192,6 +2192,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             case GGML_UNARY_OP_SILU:
                 ggml_cuda_op_silu(ctx, dst);
                 break;
+            case GGML_UNARY_OP_GELU_ERF:
+                ggml_cuda_op_gelu_erf(ctx, dst);
+                break;
             case GGML_UNARY_OP_GELU_QUICK:
                 ggml_cuda_op_gelu_quick(ctx, dst);
                 break;
@@ -2977,6 +2980,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_SIGMOID:
         case GGML_UNARY_OP_HARDSIGMOID:
         case GGML_UNARY_OP_HARDSWISH:
+        case GGML_UNARY_OP_GELU_ERF:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_EXP:
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index ec5773e01637e..2c0375fbe3cf6 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
+static __device__ __forceinline__ float op_gelu_erf(float x) {
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+
+    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
+}
+
 static __device__ __forceinline__ float op_gelu_quick(float x) {
     const float GELU_QUICK_COEF = -1.702f;
 
@@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_gelu>(ctx, dst);
 }
 
+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
+}
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
 }
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index 940a1feed9a9c..6686fc17e9193 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -30,6 +30,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

From 259469c4b57c1a32606353bcac52ba683424a990 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sat, 24 May 2025 16:49:12 +0200
Subject: [PATCH 6/9] Move GLM4 f32 attention fix to the correct function (#13750)

---
 src/llama-graph.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 13e36d161c614..cdd5887de961c 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1287,6 +1287,10 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {
@@ -1367,10 +1371,6 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
     }
 
     if (wo_b) {

From 2bd1b30f6979235ec67b95c183b9b77baa7ab9ce Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Sat, 24 May 2025 13:26:47 -0700
Subject: [PATCH 7/9] ggml-cpu : set openmp wait time if not set (#13758)

---
 ggml/src/ggml-cpu/ggml-cpu.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 46f75ad97cd61..aa51dc21a5de4 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3484,6 +3484,19 @@ void ggml_cpu_init(void) {
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
         GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+
+#ifdef GGML_USE_OPENMP
+        //if (!getenv("OMP_WAIT_POLICY")) {
+        //    // set the wait policy to active, so that OpenMP threads don't sleep
+        //    putenv("OMP_WAIT_POLICY=active");
+        //}
+
+        if (!getenv("KMP_BLOCKTIME")) {
+            // set the time to wait before sleeping a thread
+            // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
+            putenv("KMP_BLOCKTIME=200"); // 200ms
+        }
+#endif
     }
 
 #if defined(__ARM_ARCH)

From 17fc817b58942569c43aa63299cedde217b61f68 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Sat, 24 May 2025 13:27:03 -0700
Subject: [PATCH 8/9] releases : enable openmp in windows cpu backend build (#13756)

---
 .github/workflows/release.yml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 94d57b3b9cfd4..d70bb31a99de1 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -260,16 +260,18 @@ jobs:
           architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
 
       - name: Build
+        shell: cmd
         env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          cmake -S . -B build -G "Ninja Multi-Config" `
-            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake `
-            -DGGML_NATIVE=OFF `
-            -DGGML_BACKEND_DL=ON `
-            -DGGML_CPU_ALL_VARIANTS=ON `
-            -DGGML_OPENMP=OFF `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
+            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release
 

From a2d02d5793fd9af7a7224773456501691b95fd02 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Sat, 24 May 2025 15:55:16 -0700
Subject: [PATCH 9/9] releases : bundle llvm omp library in windows release (#13763)

---
 .github/workflows/release.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d70bb31a99de1..65ed244657e4f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -281,6 +281,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
           7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts