
Commit 1f105cc

Merge pull request #100 from menloresearch/update-dev-from-master-2025-05-25-00-09
Sync master with upstream release b5477
2 parents 06c52d4 + a2d02d5 commit 1f105cc

File tree

10 files changed: +91 −15 lines


.github/workflows/release.yml

Lines changed: 10 additions & 7 deletions
@@ -260,16 +260,18 @@ jobs:
           architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
 
       - name: Build
+        shell: cmd
         env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          cmake -S . -B build -G "Ninja Multi-Config" `
-            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake `
-            -DGGML_NATIVE=OFF `
-            -DGGML_BACKEND_DL=ON `
-            -DGGML_CPU_ALL_VARIANTS=ON `
-            -DGGML_OPENMP=OFF `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
+            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release
 
@@ -279,6 +281,7 @@ jobs:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
           7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts

.github/workflows/winget.yml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --submit \
+            ggml.llamacpp

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 13 additions & 0 deletions
@@ -3484,6 +3484,19 @@ void ggml_cpu_init(void) {
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
         GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+
+#ifdef GGML_USE_OPENMP
+        //if (!getenv("OMP_WAIT_POLICY")) {
+        //    // set the wait policy to active, so that OpenMP threads don't sleep
+        //    putenv("OMP_WAIT_POLICY=active");
+        //}
+
+        if (!getenv("KMP_BLOCKTIME")) {
+            // set the time to wait before sleeping a thread
+            // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
+            putenv("KMP_BLOCKTIME=200"); // 200ms
+        }
+#endif
     }
 
 #if defined(__ARM_ARCH)
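
The added block installs a KMP_BLOCKTIME default only when the variable is unset, so an explicit user setting still wins. A minimal standalone sketch of that pattern (POSIX setenv; the helper name is illustrative, not part of ggml):

// sketch: apply an environment default only when the user has not set the variable,
// mirroring the KMP_BLOCKTIME handling above
#include <cstdio>
#include <cstdlib>

static void set_default_env(const char * name, const char * value) {
    if (!std::getenv(name)) {
        // the variable is unset, so install the library default; an explicit
        // user setting would have skipped this branch entirely
        setenv(name, value, /*overwrite =*/ 0);
    }
}

int main() {
    set_default_env("KMP_BLOCKTIME", "200"); // threads spin ~200 ms before sleeping
    std::printf("KMP_BLOCKTIME=%s\n", std::getenv("KMP_BLOCKTIME"));
    return 0;
}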

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 1 addition & 0 deletions
@@ -212,6 +212,7 @@ static __global__ void flash_attn_vec_ext_f16(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ static __global__ void flash_attn_vec_ext_f32(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 4 additions & 0 deletions
@@ -2192,6 +2192,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_SILU:
             ggml_cuda_op_silu(ctx, dst);
             break;
+        case GGML_UNARY_OP_GELU_ERF:
+            ggml_cuda_op_gelu_erf(ctx, dst);
+            break;
         case GGML_UNARY_OP_GELU_QUICK:
             ggml_cuda_op_gelu_quick(ctx, dst);
             break;
@@ -2977,6 +2980,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_SIGMOID:
         case GGML_UNARY_OP_HARDSIGMOID:
         case GGML_UNARY_OP_HARDSWISH:
+        case GGML_UNARY_OP_GELU_ERF:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_EXP:

ggml/src/ggml-cuda/unary.cu

Lines changed: 10 additions & 0 deletions
@@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
+static __device__ __forceinline__ float op_gelu_erf(float x) {
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+
+    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
+}
+
 static __device__ __forceinline__ float op_gelu_quick(float x) {
     const float GELU_QUICK_COEF = -1.702f;
 
@@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_gelu>(ctx, dst);
 }
 
+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
+}
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
 }
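
The new op_gelu_erf evaluates GELU exactly via erff, while the existing op_gelu uses the tanh approximation. A small host-side C++ sketch of the difference (the harness is illustrative; constants chosen to match the usual ggml values):

// sketch: compare the exact erf-based GELU added in this commit with the
// tanh approximation used by op_gelu; outputs are close but not identical
#include <cmath>
#include <cstdio>

static float gelu_erf(float x) {
    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
    return 0.5f*x*(1.0f + std::erf(x*SQRT_2_INV));
}

static float gelu_tanh(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + std::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

int main() {
    for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
        std::printf("x = % .2f   erf GELU = % .6f   tanh GELU = % .6f\n",
                    x, gelu_erf(x), gelu_tanh(x));
    }
    return 0;
}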

ggml/src/ggml-cuda/unary.cuh

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

src/llama-graph.cpp

Lines changed: 4 additions & 4 deletions
@@ -1287,6 +1287,10 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {
@@ -1367,10 +1371,6 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
     }
 
     if (wo_b) {
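
The change appears to move the GLM4 precision request from one build_attn overload to the other; the underlying mechanism is per-node precision selection on a matmul result. A minimal sketch of that pattern, assuming standard ggml graph-building calls (the helper name is illustrative, not part of the commit):

// sketch: request an F32 accumulator for a single mul_mat node while leaving
// the rest of the graph at the default (possibly half-precision) accumulation
#include "ggml.h"

static struct ggml_tensor * project_output(struct ggml_context * ctx,
                                            struct ggml_tensor * wo,   // output projection weight, often F16
                                            struct ggml_tensor * cur,  // attention output activations
                                            bool f32_accum) {
    struct ggml_tensor * out = ggml_mul_mat(ctx, wo, cur);
    if (f32_accum) {
        // backends that honor GGML_PREC_F32 accumulate this node's dot products in float32
        ggml_mul_mat_set_prec(out, GGML_PREC_F32);
    }
    return out;
}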

src/llama-vocab.cpp

Lines changed: 4 additions & 4 deletions
@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
         // at the beginning tokenization score is zero
         tokenization_results[0] = { vocab.token_unk(), 0, 0 };
 
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
             prefix_offset = input_offset + n_utf8_code_units;
             struct best_tokenization & current_champ = tokenization_results[prefix_offset];
             if (challenger_score > current_champ.score_sum) {
-                struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                 current_champ = challenger;
             }
         }
@@ -1007,7 +1007,7 @@ struct llm_tokenizer_ugm_session {
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
-        float score_sum;
+        double score_sum;
     };
 
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
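
best_tokenization::score_sum is widened from float to double so the challenger score, already computed as a double, is stored and compared without narrowing. An illustrative sketch of the kind of drift float accumulation can introduce (the numbers are made up, not taken from the tokenizer):

// sketch: summing many per-token log-probability-scale values in float drifts
// relative to a double accumulator, which can flip comparisons between
// near-tied tokenizations
#include <cstdio>

int main() {
    const double token_score = -11.3;   // a typical per-token score magnitude
    const int    n_tokens    = 100000;  // a long input

    float  sum_f = 0.0f;
    double sum_d = 0.0;
    for (int i = 0; i < n_tokens; ++i) {
        sum_f += (float) token_score;
        sum_d += token_score;
    }

    std::printf("float  accumulator: %.4f\n", sum_f);
    std::printf("double accumulator: %.4f\n", sum_d);
    std::printf("difference        : %.4f\n", (double) sum_f - sum_d);
    return 0;
}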
