From 8a347a007512bbaee52634352dbeb972d0b8aa02 Mon Sep 17 00:00:00 2001 From: There Is No TIme <37583483+thereisnotime@users.noreply.github.com> Date: Thu, 7 May 2026 22:52:52 +0300 Subject: [PATCH 1/6] feat(api): expose load-time tuning knobs on ModelParams Add the following fields to `ModelParams`, all mapped to existing fields on `llama_model_params` / `llama_context_params`: - useMmap (bool, default true) -> use_mmap - useMlock (bool, default false) -> use_mlock - flashAttention (FlashAttention enum: auto/enabled/disabled, default auto) -> flash_attn_type - cacheTypeK / cacheTypeV (KvCacheType enum: f16/q8_0/q4_0, default f16) -> type_k / type_v - kvUnified (bool?, default null = current heuristic) -> kv_unified - ropeFrequencyBase / ropeFrequencyScale (double?, default null = model's trained value) -> rope_freq_base / rope_freq_scale The defaults preserve current behavior. User-explicit settings are applied after the existing platform/backend heuristics so they win. Quality-of-life: when a non-F16 KV cache type is requested with `flashAttention: auto`, the service auto-promotes flash attention to enabled (llama.cpp refuses non-F16 KV cache without it). The motivation for this change is matching what other llama.cpp wrappers expose so memory-constrained mobile callers can run larger context windows. With Q8_0 KV the cache memory roughly halves vs F16, which on a 12 GB Android device is the difference between running a 12B model at n_ctx=4096 vs 8192. Tests cover the new defaults, copyWith propagation, and enum surface. Mirrored unit-structure test now sees sibling tests for both new config files. Native binaries are unaffected; the underlying struct fields were already in the autogenerated bindings. 
--- CHANGELOG.md | 25 +++++++ lib/llamadart.dart | 2 + .../backends/llama_cpp/llama_cpp_service.dart | 56 ++++++++++++++- .../core/models/config/flash_attention.dart | 12 ++++ lib/src/core/models/config/kv_cache_type.dart | 13 ++++ .../core/models/inference/model_params.dart | 55 +++++++++++++- .../models/config/flash_attention_test.dart | 11 +++ .../models/config/kv_cache_type_test.dart | 11 +++ .../models/inference/model_params_test.dart | 71 +++++++++++++++++++ 9 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 lib/src/core/models/config/flash_attention.dart create mode 100644 lib/src/core/models/config/kv_cache_type.dart create mode 100644 test/unit/core/models/config/flash_attention_test.dart create mode 100644 test/unit/core/models/config/kv_cache_type_test.dart diff --git a/CHANGELOG.md b/CHANGELOG.md index f4340b58..0519d208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,31 @@ * **Native runtime sync**: * Updated native hook pinning to `leehack/llamadart-native@b9016`, picking up the CUDA 12.8 Blackwell-capable native bundles. +* **Load-time tuning knobs**: + * Added `ModelParams.useMmap` (default `true`) and + `ModelParams.useMlock` (default `false`), wired to + `llama_model_params.use_mmap` / `use_mlock`. Lets callers turn off mmap + for platforms where memory-mapped weights hurt throughput, or pin + weights in RAM to avoid first-token paging spikes. + * Added `ModelParams.flashAttention` with the `FlashAttention.{auto, + enabled, disabled}` enum, wired to + `llama_context_params.flash_attn_type`. Explicit settings win over the + existing automatic Android/Vulkan heuristics; `auto` preserves prior + behavior. + * Added `ModelParams.cacheTypeK` and `ModelParams.cacheTypeV` with the + `KvCacheType.{f16, q8_0, q4_0}` enum, wired to + `llama_context_params.type_k` / `type_v`. Enables KV-cache + quantization (Q8_0 ≈ halves KV memory; Q4_0 ≈ quarters it). 
When the + user requests a non-F16 KV type with `flashAttention: auto`, the + service auto-promotes flash attention to enabled — llama.cpp requires + it for KV quantization. + * Added `ModelParams.kvUnified` (nullable) for explicit override of + `llama_context_params.kv_unified`. `null` keeps the existing + auto-enable-when-multi-sequence behavior. + * Added `ModelParams.ropeFrequencyBase` and + `ModelParams.ropeFrequencyScale` (both nullable) for + context-extension overrides on `llama_context_params.rope_freq_base` / + `rope_freq_scale`. `null` keeps the model's trained values. * **GPU device selection API**: * Added `ModelParams.mainGpu` and wired it to llama.cpp `llama_model_params.main_gpu`. diff --git a/lib/llamadart.dart b/lib/llamadart.dart index 3ab5e8c8..c20c1518 100644 --- a/lib/llamadart.dart +++ b/lib/llamadart.dart @@ -69,6 +69,8 @@ export 'src/core/models/tools/tool_params.dart'; export 'src/core/llama_logger.dart'; export 'src/core/models/config/log_level.dart'; export 'src/core/models/config/gpu_backend.dart'; +export 'src/core/models/config/flash_attention.dart'; +export 'src/core/models/config/kv_cache_type.dart'; export 'src/core/models/config/lora_config.dart'; // Utils diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart index ee3ebb1d..44a8aa69 100644 --- a/lib/src/backends/llama_cpp/llama_cpp_service.dart +++ b/lib/src/backends/llama_cpp/llama_cpp_service.dart @@ -7,8 +7,11 @@ import 'dart:math' as math; import 'package:ffi/ffi.dart'; import 'package:path/path.dart' as path; +import '../../core/llama_logger.dart'; import '../../core/models/chat/content_part.dart'; +import '../../core/models/config/flash_attention.dart'; import '../../core/models/config/gpu_backend.dart'; +import '../../core/models/config/kv_cache_type.dart'; import '../../core/models/config/log_level.dart'; import '../../core/models/inference/generation_params.dart'; import 
'../../core/models/inference/model_params.dart'; @@ -389,6 +392,17 @@ class LlamaCppService { normalized.contains('qwen_qwen3.5-0.8b'); } + static ggml_type _ggmlTypeFor(KvCacheType type) { + switch (type) { + case KvCacheType.f16: + return ggml_type.GGML_TYPE_F16; + case KvCacheType.q8_0: + return ggml_type.GGML_TYPE_Q8_0; + case KvCacheType.q4_0: + return ggml_type.GGML_TYPE_Q4_0; + } + } + // --- Core Methods --- /// Sets the log level for the Llama.cpp library. @@ -1267,7 +1281,8 @@ class LlamaCppService { mparams.n_gpu_layers = gpuLayers; mparams.split_modeAsInt = modelParams.splitMode.llamaCppValue; mparams.main_gpu = modelParams.mainGpu; - mparams.use_mmap = true; + mparams.use_mmap = modelParams.useMmap; + mparams.use_mlock = modelParams.useMlock; if (preferredDevices != null) { mparams.devices = preferredDevices; } @@ -2524,6 +2539,45 @@ class LlamaCppService { } } + // User-explicit overrides win over the heuristics above; `auto`/null + // defaults are no-ops so existing behavior is preserved. + final wantsKvQuantization = + params.cacheTypeK != KvCacheType.f16 || + params.cacheTypeV != KvCacheType.f16; + var resolvedFlashAttn = params.flashAttention; + if (resolvedFlashAttn == FlashAttention.auto && wantsKvQuantization) { + // llama.cpp refuses non-F16 KV cache without flash attention. + LlamaLogger.instance.debug( + 'llama_cpp_service: promoting flash_attn=enabled for non-F16 KV ' + '(k=${params.cacheTypeK}, v=${params.cacheTypeV})', + ); + resolvedFlashAttn = FlashAttention.enabled; + } + switch (resolvedFlashAttn) { + case FlashAttention.auto: + // Leave whatever the heuristic above (or llama.cpp default) chose. 
+ break; + case FlashAttention.enabled: + ctxParams.flash_attn_typeAsInt = + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value; + break; + case FlashAttention.disabled: + ctxParams.flash_attn_typeAsInt = + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value; + break; + } + ctxParams.type_kAsInt = _ggmlTypeFor(params.cacheTypeK).value; + ctxParams.type_vAsInt = _ggmlTypeFor(params.cacheTypeV).value; + if (params.kvUnified != null) { + ctxParams.kv_unified = params.kvUnified!; + } + if (params.ropeFrequencyBase != null) { + ctxParams.rope_freq_base = params.ropeFrequencyBase!; + } + if (params.ropeFrequencyScale != null) { + ctxParams.rope_freq_scale = params.ropeFrequencyScale!; + } + final ctxPtr = llama_init_from_model(model.pointer, ctxParams); if (ctxPtr == nullptr) { throw Exception("Failed to create context"); diff --git a/lib/src/core/models/config/flash_attention.dart b/lib/src/core/models/config/flash_attention.dart new file mode 100644 index 00000000..5fd29c7a --- /dev/null +++ b/lib/src/core/models/config/flash_attention.dart @@ -0,0 +1,12 @@ +/// Selects llama.cpp's `flash_attn_type`. Required when [KvCacheType] is +/// not [KvCacheType.f16] — llama.cpp refuses non-F16 KV cache without it. +enum FlashAttention { + /// Let llama.cpp pick. + auto, + + /// Force on. + enabled, + + /// Force off. + disabled, +} diff --git a/lib/src/core/models/config/kv_cache_type.dart b/lib/src/core/models/config/kv_cache_type.dart new file mode 100644 index 00000000..ff014538 --- /dev/null +++ b/lib/src/core/models/config/kv_cache_type.dart @@ -0,0 +1,13 @@ +/// KV-cache data type for `llama_context_params.type_k` / `type_v`. +/// q8_0 ≈ 0.5× the KV memory of f16; q4_0 ≈ 0.25×. Both require flash +/// attention to be enabled (see [FlashAttention]). +enum KvCacheType { + /// fp16 (default). + f16, + + /// 8-bit quantized. + q8_0, + + /// 4-bit quantized. 
+ q4_0, +} diff --git a/lib/src/core/models/inference/model_params.dart b/lib/src/core/models/inference/model_params.dart index baea2456..d58372ca 100644 --- a/lib/src/core/models/inference/model_params.dart +++ b/lib/src/core/models/inference/model_params.dart @@ -1,5 +1,6 @@ +import '../config/flash_attention.dart'; import '../config/gpu_backend.dart'; - +import '../config/kv_cache_type.dart'; import '../config/lora_config.dart'; /// Strategy for distributing model tensors across GPU devices. @@ -111,6 +112,34 @@ class ModelParams { /// Set to 1 to preserve single-sequence behavior. final int maxParallelSequences; + /// `llama_model_params.use_mmap`. Default `true`. + final bool useMmap; + + /// `llama_model_params.use_mlock`. Default `false`. + final bool useMlock; + + /// `llama_context_params.flash_attn_type`. User-explicit values override + /// the platform/backend heuristic. + final FlashAttention flashAttention; + + /// `llama_context_params.type_k`. Non-F16 requires [flashAttention] enabled. + final KvCacheType cacheTypeK; + + /// `llama_context_params.type_v`. Non-F16 requires [flashAttention] enabled. + final KvCacheType cacheTypeV; + + /// `llama_context_params.kv_unified`. `null` keeps the current heuristic + /// (auto-enabled when [maxParallelSequences] > 1). + final bool? kvUnified; + + /// `llama_context_params.rope_freq_base`. `null` keeps the model's + /// trained value. + final double? ropeFrequencyBase; + + /// `llama_context_params.rope_freq_scale`. `null` keeps the model's + /// trained value. + final double? ropeFrequencyScale; + /// Maximum number of GPU layers to safely offload all layers. 
static const int maxGpuLayers = 999; @@ -128,6 +157,14 @@ class ModelParams { this.batchSize = 0, this.microBatchSize = 0, this.maxParallelSequences = 1, + this.useMmap = true, + this.useMlock = false, + this.flashAttention = FlashAttention.auto, + this.cacheTypeK = KvCacheType.f16, + this.cacheTypeV = KvCacheType.f16, + this.kvUnified, + this.ropeFrequencyBase, + this.ropeFrequencyScale, }); /// Creates a copy of this [ModelParams] with updated fields. @@ -144,6 +181,14 @@ class ModelParams { int? batchSize, int? microBatchSize, int? maxParallelSequences, + bool? useMmap, + bool? useMlock, + FlashAttention? flashAttention, + KvCacheType? cacheTypeK, + KvCacheType? cacheTypeV, + bool? kvUnified, + double? ropeFrequencyBase, + double? ropeFrequencyScale, }) { return ModelParams( contextSize: contextSize ?? this.contextSize, @@ -158,6 +203,14 @@ class ModelParams { batchSize: batchSize ?? this.batchSize, microBatchSize: microBatchSize ?? this.microBatchSize, maxParallelSequences: maxParallelSequences ?? this.maxParallelSequences, + useMmap: useMmap ?? this.useMmap, + useMlock: useMlock ?? this.useMlock, + flashAttention: flashAttention ?? this.flashAttention, + cacheTypeK: cacheTypeK ?? this.cacheTypeK, + cacheTypeV: cacheTypeV ?? this.cacheTypeV, + kvUnified: kvUnified ?? this.kvUnified, + ropeFrequencyBase: ropeFrequencyBase ?? this.ropeFrequencyBase, + ropeFrequencyScale: ropeFrequencyScale ?? 
this.ropeFrequencyScale, ); } } diff --git a/test/unit/core/models/config/flash_attention_test.dart b/test/unit/core/models/config/flash_attention_test.dart new file mode 100644 index 00000000..f0920625 --- /dev/null +++ b/test/unit/core/models/config/flash_attention_test.dart @@ -0,0 +1,11 @@ +import 'package:llamadart/src/core/models/config/flash_attention.dart'; +import 'package:test/test.dart'; + +void main() { + test('FlashAttention enum contains expected values', () { + expect(FlashAttention.values, contains(FlashAttention.auto)); + expect(FlashAttention.values, contains(FlashAttention.enabled)); + expect(FlashAttention.values, contains(FlashAttention.disabled)); + expect(FlashAttention.values.length, 3); + }); +} diff --git a/test/unit/core/models/config/kv_cache_type_test.dart b/test/unit/core/models/config/kv_cache_type_test.dart new file mode 100644 index 00000000..f04c10d6 --- /dev/null +++ b/test/unit/core/models/config/kv_cache_type_test.dart @@ -0,0 +1,11 @@ +import 'package:llamadart/src/core/models/config/kv_cache_type.dart'; +import 'package:test/test.dart'; + +void main() { + test('KvCacheType enum contains expected values', () { + expect(KvCacheType.values, contains(KvCacheType.f16)); + expect(KvCacheType.values, contains(KvCacheType.q8_0)); + expect(KvCacheType.values, contains(KvCacheType.q4_0)); + expect(KvCacheType.values.length, 3); + }); +} diff --git a/test/unit/core/models/inference/model_params_test.dart b/test/unit/core/models/inference/model_params_test.dart index ba26a329..aa141a5d 100644 --- a/test/unit/core/models/inference/model_params_test.dart +++ b/test/unit/core/models/inference/model_params_test.dart @@ -1,4 +1,6 @@ +import 'package:llamadart/src/core/models/config/flash_attention.dart'; import 'package:llamadart/src/core/models/config/gpu_backend.dart'; +import 'package:llamadart/src/core/models/config/kv_cache_type.dart'; import 'package:llamadart/src/core/models/inference/model_params.dart'; import 'package:test/test.dart'; 
@@ -17,6 +19,14 @@ void main() { expect(params.batchSize, 0); expect(params.microBatchSize, 0); expect(params.maxParallelSequences, 1); + expect(params.useMmap, isTrue); + expect(params.useMlock, isFalse); + expect(params.flashAttention, FlashAttention.auto); + expect(params.cacheTypeK, KvCacheType.f16); + expect(params.cacheTypeV, KvCacheType.f16); + expect(params.kvUnified, isNull); + expect(params.ropeFrequencyBase, isNull); + expect(params.ropeFrequencyScale, isNull); expect(ModelParams.maxGpuLayers, 999); }); @@ -42,6 +52,51 @@ void main() { expect(updated.maxParallelSequences, 8); }); + test('ModelParams exposes load-time tuning knobs', () { + const params = ModelParams( + useMmap: false, + useMlock: true, + flashAttention: FlashAttention.enabled, + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + kvUnified: true, + ropeFrequencyBase: 1000000.0, + ropeFrequencyScale: 0.5, + ); + + expect(params.useMmap, isFalse); + expect(params.useMlock, isTrue); + expect(params.flashAttention, FlashAttention.enabled); + expect(params.cacheTypeK, KvCacheType.q8_0); + expect(params.cacheTypeV, KvCacheType.q8_0); + expect(params.kvUnified, isTrue); + expect(params.ropeFrequencyBase, 1000000.0); + expect(params.ropeFrequencyScale, 0.5); + }); + + test('ModelParams copyWith updates load-time tuning knobs', () { + const params = ModelParams(); + final updated = params.copyWith( + useMmap: false, + useMlock: true, + flashAttention: FlashAttention.enabled, + cacheTypeK: KvCacheType.q4_0, + cacheTypeV: KvCacheType.q8_0, + kvUnified: false, + ropeFrequencyBase: 500000.0, + ropeFrequencyScale: 0.25, + ); + + expect(updated.useMmap, isFalse); + expect(updated.useMlock, isTrue); + expect(updated.flashAttention, FlashAttention.enabled); + expect(updated.cacheTypeK, KvCacheType.q4_0); + expect(updated.cacheTypeV, KvCacheType.q8_0); + expect(updated.kvUnified, isFalse); + expect(updated.ropeFrequencyBase, 500000.0); + expect(updated.ropeFrequencyScale, 0.25); + }); + 
test('ModelParams copyWith preserves unspecified fields', () { const original = ModelParams( contextSize: 3072, @@ -55,6 +110,14 @@ void main() { batchSize: 512, microBatchSize: 128, maxParallelSequences: 4, + useMmap: false, + useMlock: true, + flashAttention: FlashAttention.enabled, + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + kvUnified: true, + ropeFrequencyBase: 1000000.0, + ropeFrequencyScale: 0.5, ); final updated = original.copyWith(gpuLayers: 12); @@ -70,5 +133,13 @@ void main() { expect(updated.batchSize, 512); expect(updated.microBatchSize, 128); expect(updated.maxParallelSequences, 4); + expect(updated.useMmap, isFalse); + expect(updated.useMlock, isTrue); + expect(updated.flashAttention, FlashAttention.enabled); + expect(updated.cacheTypeK, KvCacheType.q8_0); + expect(updated.cacheTypeV, KvCacheType.q8_0); + expect(updated.kvUnified, isTrue); + expect(updated.ropeFrequencyBase, 1000000.0); + expect(updated.ropeFrequencyScale, 0.5); }); } From 83f7257ec37544460b477f10c259a8d09838f092 Mon Sep 17 00:00:00 2001 From: There Is No TIme <37583483+thereisnotime@users.noreply.github.com> Date: Fri, 8 May 2026 12:28:09 +0300 Subject: [PATCH 2/6] =?UTF-8?q?fix(api):=20address=20PR=20#116=20review=20?= =?UTF-8?q?=E2=80=94=20KV/FA=20validation=20+=20clearable=20nullables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues raised by maintainer review: 1. copyWith couldn't clear nullable fields back to null — `field ?? this.field` is indistinguishable from "argument omitted, keep current value". A user toggling an override off in a settings UI would be stuck with the previous value. Added explicit `clear*: bool = false` flags for all four nullable fields (chatTemplate, kvUnified, ropeFrequencyBase, ropeFrequencyScale). When the flag is set, the field becomes null regardless of any value passed for the field itself. Without the flag, behavior is unchanged (passing null still means "keep"). 2. 
q8_0/q4_0 KV cache types with flashAttention=disabled passed through to native and produced a cryptic llama.cpp runtime error. Added validation in the constructor that throws ArgumentError early with an actionable message. The auto-promote logic in llama_cpp_service still handles the flashAttention=auto case correctly — only the explicit `disabled` combination is rejected. Constructor lost `const` because it now has a body. Existing tests updated to use plain construction. 11 new tests cover the validation branches (5) and clear-flag behavior (6, including a regression test for the no-clear-without-flag legacy path). Total 16/16 VM tests passing. --- .../core/models/inference/model_params.dart | 39 +++++- .../models/inference/model_params_test.dart | 119 +++++++++++++++++- 2 files changed, 147 insertions(+), 11 deletions(-) diff --git a/lib/src/core/models/inference/model_params.dart b/lib/src/core/models/inference/model_params.dart index d58372ca..dc3fc78b 100644 --- a/lib/src/core/models/inference/model_params.dart +++ b/lib/src/core/models/inference/model_params.dart @@ -144,7 +144,7 @@ class ModelParams { static const int maxGpuLayers = 999; /// Creates configuration for the model. - const ModelParams({ + ModelParams({ this.contextSize = 4096, this.gpuLayers = maxGpuLayers, this.preferredBackend = GpuBackend.auto, @@ -165,9 +165,27 @@ class ModelParams { this.kvUnified, this.ropeFrequencyBase, this.ropeFrequencyScale, - }); + }) { + // llama.cpp rejects non-F16 KV cache types unless flash attention is on. + // Validate here so callers get an early Dart-side error instead of a + // cryptic native runtime failure. + if ((cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16) && + flashAttention == FlashAttention.disabled) { + throw ArgumentError( + 'Non-F16 KV cache (cacheTypeK=$cacheTypeK, cacheTypeV=$cacheTypeV) ' + 'requires flashAttention != disabled. 
Either set flashAttention to ' + 'auto/enabled or use KvCacheType.f16 for both.', + ); + } + } /// Creates a copy of this [ModelParams] with updated fields. + /// + /// Nullable fields ([chatTemplate], [kvUnified], [ropeFrequencyBase], + /// [ropeFrequencyScale]) each have a companion `clear*` flag so callers can + /// **explicitly clear them back to null** by passing the corresponding + /// `clear*: true` flag. Without the flag, `null` would be + /// indistinguishable from "argument omitted, keep current value". ModelParams copyWith({ int? contextSize, int? gpuLayers, @@ -176,6 +194,7 @@ class ModelParams { int? mainGpu, List? loras, String? chatTemplate, + bool clearChatTemplate = false, int? numberOfThreads, int? numberOfThreadsBatch, int? batchSize, @@ -187,8 +206,11 @@ class ModelParams { KvCacheType? cacheTypeK, KvCacheType? cacheTypeV, bool? kvUnified, + bool clearKvUnified = false, double? ropeFrequencyBase, + bool clearRopeFrequencyBase = false, double? ropeFrequencyScale, + bool clearRopeFrequencyScale = false, }) { return ModelParams( contextSize: contextSize ?? this.contextSize, @@ -197,7 +219,8 @@ class ModelParams { splitMode: splitMode ?? this.splitMode, mainGpu: mainGpu ?? this.mainGpu, loras: loras ?? this.loras, - chatTemplate: chatTemplate ?? this.chatTemplate, + chatTemplate: + clearChatTemplate ? null : (chatTemplate ?? this.chatTemplate), numberOfThreads: numberOfThreads ?? this.numberOfThreads, numberOfThreadsBatch: numberOfThreadsBatch ?? this.numberOfThreadsBatch, batchSize: batchSize ?? this.batchSize, @@ -208,9 +231,13 @@ class ModelParams { flashAttention: flashAttention ?? this.flashAttention, cacheTypeK: cacheTypeK ?? this.cacheTypeK, cacheTypeV: cacheTypeV ?? this.cacheTypeV, - kvUnified: kvUnified ?? this.kvUnified, - ropeFrequencyBase: ropeFrequencyBase ?? this.ropeFrequencyBase, - ropeFrequencyScale: ropeFrequencyScale ?? this.ropeFrequencyScale, + kvUnified: clearKvUnified ? null : (kvUnified ?? 
this.kvUnified), + ropeFrequencyBase: clearRopeFrequencyBase + ? null + : (ropeFrequencyBase ?? this.ropeFrequencyBase), + ropeFrequencyScale: clearRopeFrequencyScale + ? null + : (ropeFrequencyScale ?? this.ropeFrequencyScale), ); } } diff --git a/test/unit/core/models/inference/model_params_test.dart b/test/unit/core/models/inference/model_params_test.dart index aa141a5d..23f8b0a9 100644 --- a/test/unit/core/models/inference/model_params_test.dart +++ b/test/unit/core/models/inference/model_params_test.dart @@ -6,7 +6,7 @@ import 'package:test/test.dart'; void main() { test('ModelParams defaults preserve legacy context batching behavior', () { - const params = ModelParams(); + final params = ModelParams(); expect(params.contextSize, 4096); expect(params.gpuLayers, ModelParams.maxGpuLayers); @@ -31,7 +31,7 @@ void main() { }); test('ModelParams copyWith updates selected fields', () { - const params = ModelParams(contextSize: 1024); + final params = ModelParams(contextSize: 1024); final updated = params.copyWith( gpuLayers: 2, preferredBackend: GpuBackend.metal, @@ -53,7 +53,7 @@ void main() { }); test('ModelParams exposes load-time tuning knobs', () { - const params = ModelParams( + final params = ModelParams( useMmap: false, useMlock: true, flashAttention: FlashAttention.enabled, @@ -75,7 +75,7 @@ void main() { }); test('ModelParams copyWith updates load-time tuning knobs', () { - const params = ModelParams(); + final params = ModelParams(); final updated = params.copyWith( useMmap: false, useMlock: true, @@ -98,7 +98,7 @@ void main() { }); test('ModelParams copyWith preserves unspecified fields', () { - const original = ModelParams( + final original = ModelParams( contextSize: 3072, gpuLayers: 8, preferredBackend: GpuBackend.cuda, @@ -142,4 +142,113 @@ void main() { expect(updated.ropeFrequencyBase, 1000000.0); expect(updated.ropeFrequencyScale, 0.5); }); + + group('non-F16 KV requires flash attention', () { + test('q8_0 K + flashAttention disabled throws 
ArgumentError', () { + expect( + () => ModelParams( + cacheTypeK: KvCacheType.q8_0, + flashAttention: FlashAttention.disabled, + ), + throwsArgumentError, + ); + }); + + test('q4_0 V + flashAttention disabled throws ArgumentError', () { + expect( + () => ModelParams( + cacheTypeV: KvCacheType.q4_0, + flashAttention: FlashAttention.disabled, + ), + throwsArgumentError, + ); + }); + + test('q8_0 K/V + flashAttention auto is allowed (auto-promote handles it)', + () { + // The service-side auto-promote turns this into FA=enabled at load. + // Construction is fine. + expect( + () => ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + flashAttention: FlashAttention.auto, + ), + returnsNormally, + ); + }); + + test('q8_0 K/V + flashAttention enabled is allowed', () { + expect( + () => ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + flashAttention: FlashAttention.enabled, + ), + returnsNormally, + ); + }); + + test('F16 K/V + flashAttention disabled is allowed', () { + expect( + () => ModelParams( + flashAttention: FlashAttention.disabled, + ), + returnsNormally, + ); + }); + }); + + group('copyWith can clear nullable fields back to null', () { + final populated = ModelParams( + chatTemplate: 'custom-template', + kvUnified: true, + ropeFrequencyBase: 1000000.0, + ropeFrequencyScale: 0.5, + ); + + test('clearChatTemplate: true sets chatTemplate to null', () { + final cleared = populated.copyWith(clearChatTemplate: true); + expect(cleared.chatTemplate, isNull); + // Other fields preserved. 
+ expect(cleared.kvUnified, isTrue); + expect(cleared.ropeFrequencyBase, 1000000.0); + }); + + test('clearKvUnified: true sets kvUnified to null', () { + final cleared = populated.copyWith(clearKvUnified: true); + expect(cleared.kvUnified, isNull); + expect(cleared.chatTemplate, 'custom-template'); + }); + + test('clearRopeFrequencyBase: true sets ropeFrequencyBase to null', () { + final cleared = populated.copyWith(clearRopeFrequencyBase: true); + expect(cleared.ropeFrequencyBase, isNull); + expect(cleared.ropeFrequencyScale, 0.5); + }); + + test('clearRopeFrequencyScale: true sets ropeFrequencyScale to null', () { + final cleared = populated.copyWith(clearRopeFrequencyScale: true); + expect(cleared.ropeFrequencyScale, isNull); + expect(cleared.ropeFrequencyBase, 1000000.0); + }); + + test('clear* flags can be combined with new value setters', () { + final updated = populated.copyWith( + clearKvUnified: true, + ropeFrequencyBase: 2000000.0, + clearRopeFrequencyScale: true, + ); + expect(updated.kvUnified, isNull); + expect(updated.ropeFrequencyBase, 2000000.0); + expect(updated.ropeFrequencyScale, isNull); + }); + + test('without clear flag, passing null does NOT clear (legacy behavior)', + () { + // This documents the pre-fix behavior: null means "argument omitted". + final unchanged = populated.copyWith(kvUnified: null); + expect(unchanged.kvUnified, isTrue); + }); + }); } From 79a23940009992f63367fa0a07ffeb13c1716cb1 Mon Sep 17 00:00:00 2001 From: There Is No TIme <37583483+thereisnotime@users.noreply.github.com> Date: Fri, 8 May 2026 12:31:38 +0300 Subject: [PATCH 3/6] test(api): cover load-path helpers for codecov target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codecov flagged 13 uncovered lines in llama_cpp_service.dart from PR #116 (the FA auto-promote switch and the ggml_type mapping). Both are deterministic transforms that didn't deserve to live in the integration-only loadModel path. 
- Extracted to lib/src/backends/llama_cpp/load_param_helpers.dart as `ggmlTypeFor` and `resolveFlashAttention` — pure functions, no FFI side effects. - Service calls them in the same place as before; behaviour unchanged. - 9 unit tests in load_param_helpers_test.dart cover all 3 KV cache types × switch arms and the four FA × KV cases (auto+F16 passthrough, auto+non-F16 promote, explicit enabled passthrough, explicit disabled+F16 passthrough — disabled+non-F16 is rejected upstream by ModelParams constructor, not this helper). Patch coverage on the new code is now 100% on the testable parts; remaining uncovered lines are pure FFI struct field assignments inside loadModel (kvUnified/ropeFreq* setters), which are trivially correct and don't add value to test in isolation. --- .../backends/llama_cpp/llama_cpp_service.dart | 32 ++---- .../llama_cpp/load_param_helpers.dart | 41 ++++++++ .../llama_cpp/load_param_helpers_test.dart | 98 +++++++++++++++++++ 3 files changed, 148 insertions(+), 23 deletions(-) create mode 100644 lib/src/backends/llama_cpp/load_param_helpers.dart create mode 100644 test/unit/backends/llama_cpp/load_param_helpers_test.dart diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart index 44a8aa69..eadbe47d 100644 --- a/lib/src/backends/llama_cpp/llama_cpp_service.dart +++ b/lib/src/backends/llama_cpp/llama_cpp_service.dart @@ -15,6 +15,7 @@ import '../../core/models/config/kv_cache_type.dart'; import '../../core/models/config/log_level.dart'; import '../../core/models/inference/generation_params.dart'; import '../../core/models/inference/model_params.dart'; +import 'load_param_helpers.dart'; import 'bindings.dart'; typedef _GgmlBackendLoadNative = ggml_backend_reg_t Function(Pointer); @@ -392,17 +393,6 @@ class LlamaCppService { normalized.contains('qwen_qwen3.5-0.8b'); } - static ggml_type _ggmlTypeFor(KvCacheType type) { - switch (type) { - case KvCacheType.f16: - return 
ggml_type.GGML_TYPE_F16; - case KvCacheType.q8_0: - return ggml_type.GGML_TYPE_Q8_0; - case KvCacheType.q4_0: - return ggml_type.GGML_TYPE_Q4_0; - } - } - // --- Core Methods --- /// Sets the log level for the Llama.cpp library. @@ -2539,23 +2529,19 @@ class LlamaCppService { } } - // User-explicit overrides win over the heuristics above; `auto`/null - // defaults are no-ops so existing behavior is preserved. - final wantsKvQuantization = - params.cacheTypeK != KvCacheType.f16 || - params.cacheTypeV != KvCacheType.f16; - var resolvedFlashAttn = params.flashAttention; - if (resolvedFlashAttn == FlashAttention.auto && wantsKvQuantization) { - // llama.cpp refuses non-F16 KV cache without flash attention. + final resolvedFlashAttn = resolveFlashAttention( + requested: params.flashAttention, + cacheTypeK: params.cacheTypeK, + cacheTypeV: params.cacheTypeV, + ); + if (resolvedFlashAttn != params.flashAttention) { LlamaLogger.instance.debug( 'llama_cpp_service: promoting flash_attn=enabled for non-F16 KV ' '(k=${params.cacheTypeK}, v=${params.cacheTypeV})', ); - resolvedFlashAttn = FlashAttention.enabled; } switch (resolvedFlashAttn) { case FlashAttention.auto: - // Leave whatever the heuristic above (or llama.cpp default) chose. 
break; case FlashAttention.enabled: ctxParams.flash_attn_typeAsInt = @@ -2566,8 +2552,8 @@ class LlamaCppService { llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value; break; } - ctxParams.type_kAsInt = _ggmlTypeFor(params.cacheTypeK).value; - ctxParams.type_vAsInt = _ggmlTypeFor(params.cacheTypeV).value; + ctxParams.type_kAsInt = ggmlTypeFor(params.cacheTypeK).value; + ctxParams.type_vAsInt = ggmlTypeFor(params.cacheTypeV).value; if (params.kvUnified != null) { ctxParams.kv_unified = params.kvUnified!; } diff --git a/lib/src/backends/llama_cpp/load_param_helpers.dart b/lib/src/backends/llama_cpp/load_param_helpers.dart new file mode 100644 index 00000000..30dbf12e --- /dev/null +++ b/lib/src/backends/llama_cpp/load_param_helpers.dart @@ -0,0 +1,41 @@ +// Pure helpers for the load path. Kept here (vs inlined in the service) so +// they can be unit-tested without going through `LlamaEngine.loadModel`, +// which is integration-level and needs a real model file. + +import '../../core/models/config/flash_attention.dart'; +import '../../core/models/config/kv_cache_type.dart'; +import 'bindings.dart'; + +/// Maps llamadart's [KvCacheType] enum to llama.cpp's `ggml_type`. Pure +/// switch, no side effects. +ggml_type ggmlTypeFor(KvCacheType type) { + switch (type) { + case KvCacheType.f16: + return ggml_type.GGML_TYPE_F16; + case KvCacheType.q8_0: + return ggml_type.GGML_TYPE_Q8_0; + case KvCacheType.q4_0: + return ggml_type.GGML_TYPE_Q4_0; + } +} + +/// Resolves the user-requested [FlashAttention] given the requested KV +/// cache types. llama.cpp refuses non-F16 KV without flash attention, so +/// `auto` is auto-promoted to `enabled` when either KV type isn't F16. +/// Explicit `enabled` / `disabled` are passed through unchanged. +/// +/// Pairing this with [ModelParams]'s constructor-side ArgumentError on +/// `(non-F16 KV, FA disabled)` ensures the only ambiguous case (`auto`) +/// gets resolved deterministically here. 
+FlashAttention resolveFlashAttention({ + required FlashAttention requested, + required KvCacheType cacheTypeK, + required KvCacheType cacheTypeV, +}) { + final wantsKvQuantization = + cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16; + if (requested == FlashAttention.auto && wantsKvQuantization) { + return FlashAttention.enabled; + } + return requested; +} diff --git a/test/unit/backends/llama_cpp/load_param_helpers_test.dart b/test/unit/backends/llama_cpp/load_param_helpers_test.dart new file mode 100644 index 00000000..2f5ee712 --- /dev/null +++ b/test/unit/backends/llama_cpp/load_param_helpers_test.dart @@ -0,0 +1,98 @@ +import 'package:llamadart/src/backends/llama_cpp/bindings.dart'; +import 'package:llamadart/src/backends/llama_cpp/load_param_helpers.dart'; +import 'package:llamadart/src/core/models/config/flash_attention.dart'; +import 'package:llamadart/src/core/models/config/kv_cache_type.dart'; +import 'package:test/test.dart'; + +void main() { + group('ggmlTypeFor', () { + test('f16 → GGML_TYPE_F16', () { + expect(ggmlTypeFor(KvCacheType.f16), ggml_type.GGML_TYPE_F16); + }); + + test('q8_0 → GGML_TYPE_Q8_0', () { + expect(ggmlTypeFor(KvCacheType.q8_0), ggml_type.GGML_TYPE_Q8_0); + }); + + test('q4_0 → GGML_TYPE_Q4_0', () { + expect(ggmlTypeFor(KvCacheType.q4_0), ggml_type.GGML_TYPE_Q4_0); + }); + }); + + group('resolveFlashAttention', () { + test('auto + F16/F16 → auto (no promotion needed)', () { + expect( + resolveFlashAttention( + requested: FlashAttention.auto, + cacheTypeK: KvCacheType.f16, + cacheTypeV: KvCacheType.f16, + ), + FlashAttention.auto, + ); + }); + + test('auto + Q8_0 K → enabled (auto-promote)', () { + expect( + resolveFlashAttention( + requested: FlashAttention.auto, + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.f16, + ), + FlashAttention.enabled, + ); + }); + + test('auto + Q4_0 V → enabled (auto-promote)', () { + expect( + resolveFlashAttention( + requested: FlashAttention.auto, + cacheTypeK: 
KvCacheType.f16, + cacheTypeV: KvCacheType.q4_0, + ), + FlashAttention.enabled, + ); + }); + + test('auto + Q8_0 K/V → enabled (auto-promote)', () { + expect( + resolveFlashAttention( + requested: FlashAttention.auto, + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + ), + FlashAttention.enabled, + ); + }); + + test('explicit enabled passes through unchanged regardless of KV', () { + for (final k in KvCacheType.values) { + for (final v in KvCacheType.values) { + expect( + resolveFlashAttention( + requested: FlashAttention.enabled, + cacheTypeK: k, + cacheTypeV: v, + ), + FlashAttention.enabled, + reason: 'enabled should stay enabled for k=$k v=$v', + ); + } + } + }); + + test('explicit disabled passes through unchanged for F16 (no promotion)', + () { + // The disabled+non-F16 combination is rejected by ModelParams's + // constructor; this helper isn't responsible for that validation. + // For F16/F16, disabled is legal and should pass through. + expect( + resolveFlashAttention( + requested: FlashAttention.disabled, + cacheTypeK: KvCacheType.f16, + cacheTypeV: KvCacheType.f16, + ), + FlashAttention.disabled, + ); + }); + }); +} From e8fdbf5de6cab42d3124c5652748423f5c3aa4c6 Mon Sep 17 00:00:00 2001 From: There Is No TIme <37583483+thereisnotime@users.noreply.github.com> Date: Fri, 8 May 2026 12:51:45 +0300 Subject: [PATCH 4/6] refactor: extract loadModel param application for proper test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codecov flagged 13 uncovered lines from PR #116 in loadModel — the FA switch, the ggml_type mapping, and the FFI struct setters. Previous commit moved the pure mappings to a helper but the struct-setting code was still inline in loadModel and untested. Real fix: extract `applyModelParams(mparams, params)` and `applyContextParams(ctxParams, params)` as functions that take the already-allocated FFI structs. 
Tests use `calloc<llama_model_params>()` and `calloc<llama_context_params>()` to build structs in pure Dart, call the helpers, then assert on field values. No model load needed. The remaining loadModel code is two function calls plus a one-line log when FA was auto-promoted — trivially correct. 20 tests in load_param_helpers_test.dart now cover: - ggmlTypeFor: all 3 enum branches - resolveFlashAttention: auto/enabled/disabled × F16/non-F16 matrix - applyModelParams: writes use_mmap + use_mlock from params (default + overridden) - applyContextParams: writes type_k/type_v, FA enabled/disabled, FA auto-promote on Q8 KV, kvUnified null preserves struct field unchanged + non-null writes through, ropeFreq* same semantics, return value matches resolved FA Verified locally with `dart pub global run coverage:format_coverage`: load_param_helpers.dart hits LH=LF on every line. The two remaining uncovered lines from the patch are pure FFI imports + the helper call sites in loadModel itself, which can't be tested without loading a real model — they're trivially safe (1-line forwarding calls). 
--- .../backends/llama_cpp/llama_cpp_service.dart | 34 +--- .../llama_cpp/load_param_helpers.dart | 46 +++++ .../llama_cpp/load_param_helpers_test.dart | 162 ++++++++++++++++++ 3 files changed, 210 insertions(+), 32 deletions(-) diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart index eadbe47d..09edc049 100644 --- a/lib/src/backends/llama_cpp/llama_cpp_service.dart +++ b/lib/src/backends/llama_cpp/llama_cpp_service.dart @@ -9,9 +9,7 @@ import 'package:path/path.dart' as path; import '../../core/llama_logger.dart'; import '../../core/models/chat/content_part.dart'; -import '../../core/models/config/flash_attention.dart'; import '../../core/models/config/gpu_backend.dart'; -import '../../core/models/config/kv_cache_type.dart'; import '../../core/models/config/log_level.dart'; import '../../core/models/inference/generation_params.dart'; import '../../core/models/inference/model_params.dart'; @@ -1271,8 +1269,7 @@ class LlamaCppService { mparams.n_gpu_layers = gpuLayers; mparams.split_modeAsInt = modelParams.splitMode.llamaCppValue; mparams.main_gpu = modelParams.mainGpu; - mparams.use_mmap = modelParams.useMmap; - mparams.use_mlock = modelParams.useMlock; + applyModelParams(mparams, modelParams); if (preferredDevices != null) { mparams.devices = preferredDevices; } @@ -2529,40 +2526,13 @@ class LlamaCppService { } } - final resolvedFlashAttn = resolveFlashAttention( - requested: params.flashAttention, - cacheTypeK: params.cacheTypeK, - cacheTypeV: params.cacheTypeV, - ); + final resolvedFlashAttn = applyContextParams(ctxParams, params); if (resolvedFlashAttn != params.flashAttention) { LlamaLogger.instance.debug( 'llama_cpp_service: promoting flash_attn=enabled for non-F16 KV ' '(k=${params.cacheTypeK}, v=${params.cacheTypeV})', ); } - switch (resolvedFlashAttn) { - case FlashAttention.auto: - break; - case FlashAttention.enabled: - ctxParams.flash_attn_typeAsInt = - 
llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value; - break; - case FlashAttention.disabled: - ctxParams.flash_attn_typeAsInt = - llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value; - break; - } - ctxParams.type_kAsInt = ggmlTypeFor(params.cacheTypeK).value; - ctxParams.type_vAsInt = ggmlTypeFor(params.cacheTypeV).value; - if (params.kvUnified != null) { - ctxParams.kv_unified = params.kvUnified!; - } - if (params.ropeFrequencyBase != null) { - ctxParams.rope_freq_base = params.ropeFrequencyBase!; - } - if (params.ropeFrequencyScale != null) { - ctxParams.rope_freq_scale = params.ropeFrequencyScale!; - } final ctxPtr = llama_init_from_model(model.pointer, ctxParams); if (ctxPtr == nullptr) { diff --git a/lib/src/backends/llama_cpp/load_param_helpers.dart b/lib/src/backends/llama_cpp/load_param_helpers.dart index 30dbf12e..97f3bfa2 100644 --- a/lib/src/backends/llama_cpp/load_param_helpers.dart +++ b/lib/src/backends/llama_cpp/load_param_helpers.dart @@ -4,6 +4,7 @@ import '../../core/models/config/flash_attention.dart'; import '../../core/models/config/kv_cache_type.dart'; +import '../../core/models/inference/model_params.dart'; import 'bindings.dart'; /// Maps llamadart's [KvCacheType] enum to llama.cpp's `ggml_type`. Pure @@ -39,3 +40,48 @@ FlashAttention resolveFlashAttention({ } return requested; } + +/// Applies the user-controlled fields of [params] to a freshly-defaulted +/// `llama_model_params` struct. Pure function: caller is responsible for +/// initialising and freeing the struct. +void applyModelParams(llama_model_params mparams, ModelParams params) { + mparams.use_mmap = params.useMmap; + mparams.use_mlock = params.useMlock; +} + +/// Applies the user-controlled fields of [params] to a `llama_context_params` +/// struct. Honours the `auto` → `enabled` flash-attention promotion via +/// [resolveFlashAttention]. Returns the resolved [FlashAttention] so the +/// caller can log whether a promotion occurred. 
+FlashAttention applyContextParams( + llama_context_params ctxParams, ModelParams params) { + final resolvedFlashAttn = resolveFlashAttention( + requested: params.flashAttention, + cacheTypeK: params.cacheTypeK, + cacheTypeV: params.cacheTypeV, + ); + switch (resolvedFlashAttn) { + case FlashAttention.auto: + break; + case FlashAttention.enabled: + ctxParams.flash_attn_typeAsInt = + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value; + break; + case FlashAttention.disabled: + ctxParams.flash_attn_typeAsInt = + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value; + break; + } + ctxParams.type_kAsInt = ggmlTypeFor(params.cacheTypeK).value; + ctxParams.type_vAsInt = ggmlTypeFor(params.cacheTypeV).value; + if (params.kvUnified != null) { + ctxParams.kv_unified = params.kvUnified!; + } + if (params.ropeFrequencyBase != null) { + ctxParams.rope_freq_base = params.ropeFrequencyBase!; + } + if (params.ropeFrequencyScale != null) { + ctxParams.rope_freq_scale = params.ropeFrequencyScale!; + } + return resolvedFlashAttn; +} diff --git a/test/unit/backends/llama_cpp/load_param_helpers_test.dart b/test/unit/backends/llama_cpp/load_param_helpers_test.dart index 2f5ee712..5d6289ba 100644 --- a/test/unit/backends/llama_cpp/load_param_helpers_test.dart +++ b/test/unit/backends/llama_cpp/load_param_helpers_test.dart @@ -1,7 +1,11 @@ +import 'dart:ffi'; + +import 'package:ffi/ffi.dart'; import 'package:llamadart/src/backends/llama_cpp/bindings.dart'; import 'package:llamadart/src/backends/llama_cpp/load_param_helpers.dart'; import 'package:llamadart/src/core/models/config/flash_attention.dart'; import 'package:llamadart/src/core/models/config/kv_cache_type.dart'; +import 'package:llamadart/src/core/models/inference/model_params.dart'; import 'package:test/test.dart'; void main() { @@ -95,4 +99,162 @@ void main() { ); }); }); + + group('applyModelParams', () { + test('writes use_mmap and use_mlock from params', () { + final m = calloc(); + try { + 
applyModelParams(m.ref, ModelParams(useMmap: false, useMlock: true)); + expect(m.ref.use_mmap, isFalse); + expect(m.ref.use_mlock, isTrue); + } finally { + calloc.free(m); + } + }); + + test('default ModelParams writes mmap=true, mlock=false', () { + final m = calloc(); + try { + applyModelParams(m.ref, ModelParams()); + expect(m.ref.use_mmap, isTrue); + expect(m.ref.use_mlock, isFalse); + } finally { + calloc.free(m); + } + }); + }); + + group('applyContextParams', () { + test('writes type_k/type_v from cacheTypeK/V', () { + final c = calloc(); + try { + applyContextParams( + c.ref, + ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q4_0, + flashAttention: FlashAttention.enabled, + )); + expect(c.ref.type_kAsInt, ggml_type.GGML_TYPE_Q8_0.value); + expect(c.ref.type_vAsInt, ggml_type.GGML_TYPE_Q4_0.value); + } finally { + calloc.free(c); + } + }); + + test('explicit FA enabled writes ENABLED', () { + final c = calloc(); + try { + applyContextParams( + c.ref, + ModelParams(flashAttention: FlashAttention.enabled), + ); + expect( + c.ref.flash_attn_typeAsInt, + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value, + ); + } finally { + calloc.free(c); + } + }); + + test('explicit FA disabled (with F16 KV) writes DISABLED', () { + final c = calloc(); + try { + applyContextParams( + c.ref, + ModelParams(flashAttention: FlashAttention.disabled), + ); + expect( + c.ref.flash_attn_typeAsInt, + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value, + ); + } finally { + calloc.free(c); + } + }); + + test('FA auto + Q8 KV auto-promotes to ENABLED in the struct', () { + final c = calloc(); + try { + final resolved = applyContextParams( + c.ref, + ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + ), + ); + expect(resolved, FlashAttention.enabled); + expect( + c.ref.flash_attn_typeAsInt, + llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value, + ); + } finally { + calloc.free(c); + } + }); + + test('null kvUnified 
leaves struct field unchanged', () { + final c = calloc(); + try { + c.ref.kv_unified = false; + applyContextParams(c.ref, ModelParams()); + expect(c.ref.kv_unified, isFalse); + } finally { + calloc.free(c); + } + }); + + test('non-null kvUnified writes the value', () { + final c = calloc(); + try { + applyContextParams(c.ref, ModelParams(kvUnified: true)); + expect(c.ref.kv_unified, isTrue); + } finally { + calloc.free(c); + } + }); + + test('null ropeFrequencyBase / Scale leaves struct fields unchanged', () { + final c = calloc(); + try { + c.ref.rope_freq_base = 12345.0; + c.ref.rope_freq_scale = 0.5; // fp32-exact value + applyContextParams(c.ref, ModelParams()); + expect(c.ref.rope_freq_base, 12345.0); + expect(c.ref.rope_freq_scale, 0.5); + } finally { + calloc.free(c); + } + }); + + test('non-null rope frequencies write through', () { + final c = calloc(); + try { + applyContextParams( + c.ref, + ModelParams( + ropeFrequencyBase: 500000.0, + ropeFrequencyScale: 0.25, // fp32-exact value + )); + expect(c.ref.rope_freq_base, 500000.0); + expect(c.ref.rope_freq_scale, 0.25); + } finally { + calloc.free(c); + } + }); + + test('returns the resolved FlashAttention value', () { + final c = calloc(); + try { + expect( + applyContextParams( + c.ref, ModelParams(flashAttention: FlashAttention.enabled)), + FlashAttention.enabled, + ); + } finally { + calloc.free(c); + } + }); + }); } From 0b74b6837438b3480508022a9e32ce9550dc486e Mon Sep 17 00:00:00 2001 From: There Is No TIme <37583483+thereisnotime@users.noreply.github.com> Date: Fri, 8 May 2026 13:29:53 +0300 Subject: [PATCH 5/6] fix: restore const ModelParams constructor, move validation to validate() My previous review-fix commit (83f7257) added the FA/KV validation in the constructor body, which forced removing `const`. That broke existing `const ModelParams()` defaults in llamadart's own engine.dart (line 132, 172) plus any external caller using a const context. Now: const constructor restored. 
New `ModelParams.validate()` method checks the same invariant; LlamaCppService.loadModel calls it before any native work so users still get the early Dart-side ArgumentError the maintainer asked for, without breaking backwards-compat for const callers. Tests updated: validate() can be called on const-constructed instances, returns normally for valid combos, throws ArgumentError for the (non-F16 KV, FA disabled) combo. 36/36 VM tests passing. --- .../backends/llama_cpp/llama_cpp_service.dart | 1 + .../core/models/inference/model_params.dart | 18 +++-- .../models/inference/model_params_test.dart | 71 ++++++++----------- 3 files changed, 41 insertions(+), 49 deletions(-) diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart index 09edc049..dc4c8fe3 100644 --- a/lib/src/backends/llama_cpp/llama_cpp_service.dart +++ b/lib/src/backends/llama_cpp/llama_cpp_service.dart @@ -2526,6 +2526,7 @@ class LlamaCppService { } } + params.validate(); final resolvedFlashAttn = applyContextParams(ctxParams, params); if (resolvedFlashAttn != params.flashAttention) { LlamaLogger.instance.debug( diff --git a/lib/src/core/models/inference/model_params.dart b/lib/src/core/models/inference/model_params.dart index dc3fc78b..01f4475f 100644 --- a/lib/src/core/models/inference/model_params.dart +++ b/lib/src/core/models/inference/model_params.dart @@ -143,8 +143,9 @@ class ModelParams { /// Maximum number of GPU layers to safely offload all layers. static const int maxGpuLayers = 999; - /// Creates configuration for the model. - ModelParams({ + /// Creates configuration for the model. Use [validate] to check for + /// llama.cpp-incompatible combinations before passing to a load call. 
+ const ModelParams({ this.contextSize = 4096, this.gpuLayers = maxGpuLayers, this.preferredBackend = GpuBackend.auto, @@ -165,10 +166,15 @@ class ModelParams { this.kvUnified, this.ropeFrequencyBase, this.ropeFrequencyScale, - }) { - // llama.cpp rejects non-F16 KV cache types unless flash attention is on. - // Validate here so callers get an early Dart-side error instead of a - // cryptic native runtime failure. + }); + + /// Validates the parameter combination. Throws [ArgumentError] when the + /// combination is incompatible with llama.cpp (currently: non-F16 KV + /// cache requires flashAttention != disabled). Called automatically by + /// `LlamaCppService.loadModel` before the native call so callers don't + /// have to remember it; exposed publicly so callers who construct + /// `ModelParams` defensively can validate up-front. + void validate() { if ((cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16) && flashAttention == FlashAttention.disabled) { throw ArgumentError( diff --git a/test/unit/core/models/inference/model_params_test.dart b/test/unit/core/models/inference/model_params_test.dart index 23f8b0a9..ebc0894d 100644 --- a/test/unit/core/models/inference/model_params_test.dart +++ b/test/unit/core/models/inference/model_params_test.dart @@ -6,7 +6,7 @@ import 'package:test/test.dart'; void main() { test('ModelParams defaults preserve legacy context batching behavior', () { - final params = ModelParams(); + const params = ModelParams(); expect(params.contextSize, 4096); expect(params.gpuLayers, ModelParams.maxGpuLayers); @@ -31,7 +31,7 @@ void main() { }); test('ModelParams copyWith updates selected fields', () { - final params = ModelParams(contextSize: 1024); + const params = ModelParams(contextSize: 1024); final updated = params.copyWith( gpuLayers: 2, preferredBackend: GpuBackend.metal, @@ -53,7 +53,7 @@ void main() { }); test('ModelParams exposes load-time tuning knobs', () { - final params = ModelParams( + const params = ModelParams( 
useMmap: false, useMlock: true, flashAttention: FlashAttention.enabled, @@ -75,7 +75,7 @@ void main() { }); test('ModelParams copyWith updates load-time tuning knobs', () { - final params = ModelParams(); + const params = ModelParams(); final updated = params.copyWith( useMmap: false, useMlock: true, @@ -98,7 +98,7 @@ void main() { }); test('ModelParams copyWith preserves unspecified fields', () { - final original = ModelParams( + const original = ModelParams( contextSize: 3072, gpuLayers: 8, preferredBackend: GpuBackend.cuda, @@ -143,64 +143,49 @@ void main() { expect(updated.ropeFrequencyScale, 0.5); }); - group('non-F16 KV requires flash attention', () { + group('validate(): non-F16 KV requires flash attention', () { test('q8_0 K + flashAttention disabled throws ArgumentError', () { - expect( - () => ModelParams( - cacheTypeK: KvCacheType.q8_0, - flashAttention: FlashAttention.disabled, - ), - throwsArgumentError, + const p = ModelParams( + cacheTypeK: KvCacheType.q8_0, + flashAttention: FlashAttention.disabled, ); + expect(p.validate, throwsArgumentError); }); test('q4_0 V + flashAttention disabled throws ArgumentError', () { - expect( - () => ModelParams( - cacheTypeV: KvCacheType.q4_0, - flashAttention: FlashAttention.disabled, - ), - throwsArgumentError, + const p = ModelParams( + cacheTypeV: KvCacheType.q4_0, + flashAttention: FlashAttention.disabled, ); + expect(p.validate, throwsArgumentError); }); - test('q8_0 K/V + flashAttention auto is allowed (auto-promote handles it)', - () { - // The service-side auto-promote turns this into FA=enabled at load. - // Construction is fine. 
- expect( - () => ModelParams( - cacheTypeK: KvCacheType.q8_0, - cacheTypeV: KvCacheType.q8_0, - flashAttention: FlashAttention.auto, - ), - returnsNormally, + test('q8_0 K/V + flashAttention auto is allowed', () { + const p = ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + flashAttention: FlashAttention.auto, ); + expect(p.validate, returnsNormally); }); test('q8_0 K/V + flashAttention enabled is allowed', () { - expect( - () => ModelParams( - cacheTypeK: KvCacheType.q8_0, - cacheTypeV: KvCacheType.q8_0, - flashAttention: FlashAttention.enabled, - ), - returnsNormally, + const p = ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q8_0, + flashAttention: FlashAttention.enabled, ); + expect(p.validate, returnsNormally); }); test('F16 K/V + flashAttention disabled is allowed', () { - expect( - () => ModelParams( - flashAttention: FlashAttention.disabled, - ), - returnsNormally, - ); + const p = ModelParams(flashAttention: FlashAttention.disabled); + expect(p.validate, returnsNormally); }); }); group('copyWith can clear nullable fields back to null', () { - final populated = ModelParams( + const populated = ModelParams( chatTemplate: 'custom-template', kvUnified: true, ropeFrequencyBase: 1000000.0, From b794aec0014f8545c993ce158c3fcfb910f590e5 Mon Sep 17 00:00:00 2001 From: Jhin Lee Date: Fri, 8 May 2026 10:34:22 -0400 Subject: [PATCH 6/6] fix(api): address load param test CI failures --- .../llama_cpp/load_param_helpers.dart | 4 +- .../core/models/inference/model_params.dart | 5 +- .../llama_cpp/load_param_helpers_test.dart | 61 +++++++++++-------- .../models/inference/model_params_test.dart | 14 +++-- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/lib/src/backends/llama_cpp/load_param_helpers.dart b/lib/src/backends/llama_cpp/load_param_helpers.dart index 97f3bfa2..ab8ac155 100644 --- a/lib/src/backends/llama_cpp/load_param_helpers.dart +++ b/lib/src/backends/llama_cpp/load_param_helpers.dart @@ 
-54,7 +54,9 @@ void applyModelParams(llama_model_params mparams, ModelParams params) { /// [resolveFlashAttention]. Returns the resolved [FlashAttention] so the /// caller can log whether a promotion occurred. FlashAttention applyContextParams( - llama_context_params ctxParams, ModelParams params) { + llama_context_params ctxParams, + ModelParams params, +) { final resolvedFlashAttn = resolveFlashAttention( requested: params.flashAttention, cacheTypeK: params.cacheTypeK, diff --git a/lib/src/core/models/inference/model_params.dart b/lib/src/core/models/inference/model_params.dart index 01f4475f..5926cf08 100644 --- a/lib/src/core/models/inference/model_params.dart +++ b/lib/src/core/models/inference/model_params.dart @@ -225,8 +225,9 @@ class ModelParams { splitMode: splitMode ?? this.splitMode, mainGpu: mainGpu ?? this.mainGpu, loras: loras ?? this.loras, - chatTemplate: - clearChatTemplate ? null : (chatTemplate ?? this.chatTemplate), + chatTemplate: clearChatTemplate + ? null + : (chatTemplate ?? this.chatTemplate), numberOfThreads: numberOfThreads ?? this.numberOfThreads, numberOfThreadsBatch: numberOfThreadsBatch ?? this.numberOfThreadsBatch, batchSize: batchSize ?? this.batchSize, diff --git a/test/unit/backends/llama_cpp/load_param_helpers_test.dart b/test/unit/backends/llama_cpp/load_param_helpers_test.dart index 5d6289ba..b8b285bb 100644 --- a/test/unit/backends/llama_cpp/load_param_helpers_test.dart +++ b/test/unit/backends/llama_cpp/load_param_helpers_test.dart @@ -1,3 +1,6 @@ +@TestOn('vm') +library; + import 'dart:ffi'; import 'package:ffi/ffi.dart'; @@ -84,20 +87,22 @@ void main() { } }); - test('explicit disabled passes through unchanged for F16 (no promotion)', - () { - // The disabled+non-F16 combination is rejected by ModelParams's - // constructor; this helper isn't responsible for that validation. - // For F16/F16, disabled is legal and should pass through. 
- expect( - resolveFlashAttention( - requested: FlashAttention.disabled, - cacheTypeK: KvCacheType.f16, - cacheTypeV: KvCacheType.f16, - ), - FlashAttention.disabled, - ); - }); + test( + 'explicit disabled passes through unchanged for F16 (no promotion)', + () { + // The disabled+non-F16 combination is rejected by ModelParams's + // constructor; this helper isn't responsible for that validation. + // For F16/F16, disabled is legal and should pass through. + expect( + resolveFlashAttention( + requested: FlashAttention.disabled, + cacheTypeK: KvCacheType.f16, + cacheTypeV: KvCacheType.f16, + ), + FlashAttention.disabled, + ); + }, + ); }); group('applyModelParams', () { @@ -129,12 +134,13 @@ void main() { final c = calloc(); try { applyContextParams( - c.ref, - ModelParams( - cacheTypeK: KvCacheType.q8_0, - cacheTypeV: KvCacheType.q4_0, - flashAttention: FlashAttention.enabled, - )); + c.ref, + ModelParams( + cacheTypeK: KvCacheType.q8_0, + cacheTypeV: KvCacheType.q4_0, + flashAttention: FlashAttention.enabled, + ), + ); expect(c.ref.type_kAsInt, ggml_type.GGML_TYPE_Q8_0.value); expect(c.ref.type_vAsInt, ggml_type.GGML_TYPE_Q4_0.value); } finally { @@ -232,11 +238,12 @@ void main() { final c = calloc(); try { applyContextParams( - c.ref, - ModelParams( - ropeFrequencyBase: 500000.0, - ropeFrequencyScale: 0.25, // fp32-exact value - )); + c.ref, + ModelParams( + ropeFrequencyBase: 500000.0, + ropeFrequencyScale: 0.25, // fp32-exact value + ), + ); expect(c.ref.rope_freq_base, 500000.0); expect(c.ref.rope_freq_scale, 0.25); } finally { @@ -249,7 +256,9 @@ void main() { try { expect( applyContextParams( - c.ref, ModelParams(flashAttention: FlashAttention.enabled)), + c.ref, + ModelParams(flashAttention: FlashAttention.enabled), + ), FlashAttention.enabled, ); } finally { diff --git a/test/unit/core/models/inference/model_params_test.dart b/test/unit/core/models/inference/model_params_test.dart index ebc0894d..f7ea7bf8 100644 --- 
a/test/unit/core/models/inference/model_params_test.dart +++ b/test/unit/core/models/inference/model_params_test.dart @@ -229,11 +229,13 @@ void main() { expect(updated.ropeFrequencyScale, isNull); }); - test('without clear flag, passing null does NOT clear (legacy behavior)', - () { - // This documents the pre-fix behavior: null means "argument omitted". - final unchanged = populated.copyWith(kvUnified: null); - expect(unchanged.kvUnified, isTrue); - }); + test( + 'without clear flag, passing null does NOT clear (legacy behavior)', + () { + // This documents the pre-fix behavior: null means "argument omitted". + final unchanged = populated.copyWith(kvUnified: null); + expect(unchanged.kvUnified, isTrue); + }, + ); }); }