leehack · leehack · May 8, 2026 · May 7, 2026 · May 8, 2026 · May 8, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,31 @@
 * **Native runtime sync**:
   * Updated native hook pinning to `leehack/llamadart-native@b9016`,
     picking up the CUDA 12.8 Blackwell-capable native bundles.
+* **Load-time tuning knobs**:
+  * Added `ModelParams.useMmap` (default `true`) and
+    `ModelParams.useMlock` (default `false`), wired to
+    `llama_model_params.use_mmap` / `use_mlock`. Lets callers turn off mmap
+    for platforms where memory-mapped weights hurt throughput, or pin
+    weights in RAM to avoid first-token paging spikes.
+  * Added `ModelParams.flashAttention` with the `FlashAttention.{auto,
+    enabled, disabled}` enum, wired to
+    `llama_context_params.flash_attn_type`. Explicit settings win over the
+    existing automatic Android/Vulkan heuristics; `auto` preserves prior
+    behavior.
+  * Added `ModelParams.cacheTypeK` and `ModelParams.cacheTypeV` with the
+    `KvCacheType.{f16, q8_0, q4_0}` enum, wired to
+    `llama_context_params.type_k` / `type_v`. Enables KV-cache
+    quantization (Q8_0 roughly halves KV memory; Q4_0 roughly quarters it).
+    A non-F16 KV type with `flashAttention: auto` auto-promotes flash
+    attention to enabled, since llama.cpp requires it for KV quantization;
+    with `disabled` instead, `ModelParams.validate()` throws `ArgumentError`.
+  * Added `ModelParams.kvUnified` (nullable) for explicit override of
+    `llama_context_params.kv_unified`. `null` keeps the existing
+    auto-enable-when-multi-sequence behavior.
+  * Added `ModelParams.ropeFrequencyBase` and
+    `ModelParams.ropeFrequencyScale` (both nullable) for
+    context-extension overrides on `llama_context_params.rope_freq_base` /
+    `rope_freq_scale`. `null` keeps the model's trained values.
 * **GPU device selection API**:
   * Added `ModelParams.mainGpu` and wired it to llama.cpp
     `llama_model_params.main_gpu`.

diff --git a/lib/llamadart.dart b/lib/llamadart.dart
@@ -69,6 +69,8 @@ export 'src/core/models/tools/tool_params.dart';
 export 'src/core/llama_logger.dart';
 export 'src/core/models/config/log_level.dart';
 export 'src/core/models/config/gpu_backend.dart';
+export 'src/core/models/config/flash_attention.dart';
+export 'src/core/models/config/kv_cache_type.dart';
 export 'src/core/models/config/lora_config.dart';
 
 // Utils

diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart
@@ -7,11 +7,13 @@ import 'dart:math' as math;
 import 'package:ffi/ffi.dart';
 import 'package:path/path.dart' as path;
 
+import '../../core/llama_logger.dart';
 import '../../core/models/chat/content_part.dart';
 import '../../core/models/config/gpu_backend.dart';
 import '../../core/models/config/log_level.dart';
 import '../../core/models/inference/generation_params.dart';
 import '../../core/models/inference/model_params.dart';
+import 'load_param_helpers.dart';
 import 'bindings.dart';
 
 typedef _GgmlBackendLoadNative = ggml_backend_reg_t Function(Pointer<Char>);
@@ -1267,7 +1269,7 @@ class LlamaCppService {
     mparams.n_gpu_layers = gpuLayers;
     mparams.split_modeAsInt = modelParams.splitMode.llamaCppValue;
     mparams.main_gpu = modelParams.mainGpu;
-    mparams.use_mmap = true;
+    applyModelParams(mparams, modelParams);
     if (preferredDevices != null) {
       mparams.devices = preferredDevices;
     }
@@ -2524,6 +2526,15 @@ class LlamaCppService {
       }
     }
 
+    params.validate();
+    final resolvedFlashAttn = applyContextParams(ctxParams, params);
+    if (resolvedFlashAttn != params.flashAttention) {
+      LlamaLogger.instance.debug(
+        'llama_cpp_service: promoting flash_attn=enabled for non-F16 KV '
+        '(k=${params.cacheTypeK}, v=${params.cacheTypeV})',
+      );
+    }
+
     final ctxPtr = llama_init_from_model(model.pointer, ctxParams);
     if (ctxPtr == nullptr) {
       throw Exception("Failed to create context");

diff --git a/lib/src/backends/llama_cpp/load_param_helpers.dart b/lib/src/backends/llama_cpp/load_param_helpers.dart
@@ -0,0 +1,89 @@
+// Pure helpers for the load path. Kept here (vs inlined in the service) so
+// they can be unit-tested without going through `LlamaEngine.loadModel`,
+// which is integration-level and needs a real model file.
+
+import '../../core/models/config/flash_attention.dart';
+import '../../core/models/config/kv_cache_type.dart';
+import '../../core/models/inference/model_params.dart';
+import 'bindings.dart';
+
+/// Translates a [KvCacheType] into the matching llama.cpp `ggml_type` value.
+/// The switch has one case per enum value; nothing else is read or written.
+ggml_type ggmlTypeFor(KvCacheType type) {
+  switch (type) {
+    case KvCacheType.f16:
+      return ggml_type.GGML_TYPE_F16;
+    case KvCacheType.q8_0:
+      return ggml_type.GGML_TYPE_Q8_0;
+    case KvCacheType.q4_0:
+      return ggml_type.GGML_TYPE_Q4_0;
+  }
+}
+
+/// Resolves the user-requested [FlashAttention] given the requested KV
+/// cache types. llama.cpp refuses non-F16 KV without flash attention, so
+/// `auto` is auto-promoted to `enabled` when either KV type isn't F16.
+/// Explicit `enabled` / `disabled` are passed through unchanged.
+///
+/// Pairing this with [ModelParams.validate], which throws ArgumentError on
+/// `(non-F16 KV, FA disabled)`, ensures the only ambiguous case (`auto`)
+/// gets resolved deterministically here.
+FlashAttention resolveFlashAttention({
+  required FlashAttention requested,
+  required KvCacheType cacheTypeK,
+  required KvCacheType cacheTypeV,
+}) {
+  final wantsKvQuantization =
+      cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16;
+  if (requested == FlashAttention.auto && wantsKvQuantization) {
+    return FlashAttention.enabled;
+  }
+  return requested;
+}
+
+/// Copies the mmap/mlock load flags from [params] onto [mparams], mutating
+/// the struct in place. Does no allocation: the caller is responsible for
+/// initialising and freeing the struct.
+void applyModelParams(llama_model_params mparams, ModelParams params) {
+  mparams.use_mmap = params.useMmap;
+  mparams.use_mlock = params.useMlock;
+}
+
+/// Writes the KV-cache types and, when set, the flash-attention, unified-KV
+/// and RoPE overrides from [params] into [ctxParams]; `auto` and `null`
+/// leave the existing struct values untouched. Returns the flash-attention
+/// mode after [resolveFlashAttention] so the caller can log a promotion.
+FlashAttention applyContextParams(
+  llama_context_params ctxParams,
+  ModelParams params,
+) {
+  final resolvedFlashAttn = resolveFlashAttention(
+    requested: params.flashAttention,
+    cacheTypeK: params.cacheTypeK,
+    cacheTypeV: params.cacheTypeV,
+  );
+  switch (resolvedFlashAttn) {
+    case FlashAttention.auto:
+      break;
+    case FlashAttention.enabled:
+      ctxParams.flash_attn_typeAsInt =
+          llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value;
+      break;
+    case FlashAttention.disabled:
+      ctxParams.flash_attn_typeAsInt =
+          llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value;
+      break;
+  }
+  ctxParams.type_kAsInt = ggmlTypeFor(params.cacheTypeK).value;
+  ctxParams.type_vAsInt = ggmlTypeFor(params.cacheTypeV).value;
+  if (params.kvUnified != null) {
+    ctxParams.kv_unified = params.kvUnified!;
+  }
+  if (params.ropeFrequencyBase != null) {
+    ctxParams.rope_freq_base = params.ropeFrequencyBase!;
+  }
+  if (params.ropeFrequencyScale != null) {
+    ctxParams.rope_freq_scale = params.ropeFrequencyScale!;
+  }
+  return resolvedFlashAttn;
+}
diff --git a/lib/src/core/models/config/flash_attention.dart b/lib/src/core/models/config/flash_attention.dart
@@ -0,0 +1,12 @@
+/// Selects llama.cpp's `flash_attn_type`. Must not be [disabled] when a KV
+/// cache type other than [KvCacheType.f16] is used (llama.cpp requires it).
+enum FlashAttention {
+  /// Leave `flash_attn_type` as-is; promoted to [enabled] for non-F16 KV.
+  auto,
+
+  /// Force on.
+  enabled,
+
+  /// Force off.
+  disabled,
+}
diff --git a/lib/src/core/models/config/kv_cache_type.dart b/lib/src/core/models/config/kv_cache_type.dart
@@ -0,0 +1,13 @@
+/// KV-cache data type for `llama_context_params.type_k` / `type_v`.
+/// [q8_0] uses roughly 0.5× the KV memory of [f16] and [q4_0] roughly 0.25×.
+/// Both quantized types need flash attention enabled (see [FlashAttention]).
+enum KvCacheType {
+  /// 16-bit float, unquantized (default).
+  f16,
+
+  /// 8-bit quantized.
+  q8_0,
+
+  /// 4-bit quantized.
+  q4_0,
+}
diff --git a/lib/src/core/models/inference/model_params.dart b/lib/src/core/models/inference/model_params.dart
@@ -1,5 +1,6 @@
+import '../config/flash_attention.dart';
 import '../config/gpu_backend.dart';
-
+import '../config/kv_cache_type.dart';
 import '../config/lora_config.dart';
 
 /// Strategy for distributing model tensors across GPU devices.
@@ -111,10 +112,39 @@ class ModelParams {
   /// Set to 1 to preserve single-sequence behavior.
   final int maxParallelSequences;
 
+  /// `llama_model_params.use_mmap`. Default `true`.
+  final bool useMmap;
+
+  /// `llama_model_params.use_mlock`. Default `false`.
+  final bool useMlock;
+
+  /// `llama_context_params.flash_attn_type`. User-explicit values override
+  /// the platform/backend heuristic.
+  final FlashAttention flashAttention;
+
+  /// `llama_context_params.type_k`. Non-F16 requires [flashAttention] enabled.
+  final KvCacheType cacheTypeK;
+
+  /// `llama_context_params.type_v`. Non-F16 requires [flashAttention] enabled.
+  final KvCacheType cacheTypeV;
+
+  /// `llama_context_params.kv_unified`. `null` keeps the current heuristic
+  /// (auto-enabled when [maxParallelSequences] > 1).
+  final bool? kvUnified;
+
+  /// `llama_context_params.rope_freq_base`. `null` keeps the model's
+  /// trained value.
+  final double? ropeFrequencyBase;
+
+  /// `llama_context_params.rope_freq_scale`. `null` keeps the model's
+  /// trained value.
+  final double? ropeFrequencyScale;
+
   /// Maximum number of GPU layers to safely offload all layers.
   static const int maxGpuLayers = 999;
 
-  /// Creates configuration for the model.
+  /// Creates configuration for the model. Use [validate] to check for
+  /// llama.cpp-incompatible combinations before passing to a load call.
   const ModelParams({
     this.contextSize = 4096,
     this.gpuLayers = maxGpuLayers,
@@ -128,9 +158,40 @@ class ModelParams {
     this.batchSize = 0,
     this.microBatchSize = 0,
     this.maxParallelSequences = 1,
+    this.useMmap = true,
+    this.useMlock = false,
+    this.flashAttention = FlashAttention.auto,
+    this.cacheTypeK = KvCacheType.f16,
+    this.cacheTypeV = KvCacheType.f16,
+    this.kvUnified,
+    this.ropeFrequencyBase,
+    this.ropeFrequencyScale,
   });
 
+  /// Validates the parameter combination. Throws [ArgumentError] when a
+  /// non-F16 [cacheTypeK] or [cacheTypeV] is combined with
+  /// [FlashAttention.disabled]; `auto` and `enabled` are accepted. Called
+  /// automatically by `LlamaCppService.loadModel` before the native call so
+  /// callers don't have to remember it; exposed publicly so callers who
+  /// construct `ModelParams` defensively can validate up-front.
+  void validate() {
+    if ((cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16) &&
+        flashAttention == FlashAttention.disabled) {
+      throw ArgumentError(
+        'Non-F16 KV cache (cacheTypeK=$cacheTypeK, cacheTypeV=$cacheTypeV) '
+        'requires flashAttention != disabled. Either set flashAttention to '
+        'auto/enabled or use KvCacheType.f16 for both.',
+      );
+    }
+  }
+
   /// Creates a copy of this [ModelParams] with updated fields.
+  ///
+  /// Nullable fields ([chatTemplate], [kvUnified], [ropeFrequencyBase],
+  /// [ropeFrequencyScale]) each have a companion `clear*` flag so callers
+  /// can **explicitly reset them to null** by passing `clear*: true`; the
+  /// flag takes precedence over any value passed for the same field. Without
+  /// it, `null` is indistinguishable from "argument omitted, keep current".
   ModelParams copyWith({
     int? contextSize,
     int? gpuLayers,
@@ -139,11 +200,23 @@ class ModelParams {
     int? mainGpu,
     List<LoraAdapterConfig>? loras,
     String? chatTemplate,
+    bool clearChatTemplate = false,
     int? numberOfThreads,
     int? numberOfThreadsBatch,
     int? batchSize,
     int? microBatchSize,
     int? maxParallelSequences,
+    bool? useMmap,
+    bool? useMlock,
+    FlashAttention? flashAttention,
+    KvCacheType? cacheTypeK,
+    KvCacheType? cacheTypeV,
+    bool? kvUnified,
+    bool clearKvUnified = false,
+    double? ropeFrequencyBase,
+    bool clearRopeFrequencyBase = false,
+    double? ropeFrequencyScale,
+    bool clearRopeFrequencyScale = false,
   }) {
     return ModelParams(
       contextSize: contextSize ?? this.contextSize,
@@ -152,12 +225,26 @@ class ModelParams {
       splitMode: splitMode ?? this.splitMode,
       mainGpu: mainGpu ?? this.mainGpu,
       loras: loras ?? this.loras,
-      chatTemplate: chatTemplate ?? this.chatTemplate,
+      chatTemplate: clearChatTemplate
+          ? null
+          : (chatTemplate ?? this.chatTemplate),
       numberOfThreads: numberOfThreads ?? this.numberOfThreads,
       numberOfThreadsBatch: numberOfThreadsBatch ?? this.numberOfThreadsBatch,
       batchSize: batchSize ?? this.batchSize,
       microBatchSize: microBatchSize ?? this.microBatchSize,
       maxParallelSequences: maxParallelSequences ?? this.maxParallelSequences,
+      useMmap: useMmap ?? this.useMmap,
+      useMlock: useMlock ?? this.useMlock,
+      flashAttention: flashAttention ?? this.flashAttention,
+      cacheTypeK: cacheTypeK ?? this.cacheTypeK,
+      cacheTypeV: cacheTypeV ?? this.cacheTypeV,
+      kvUnified: clearKvUnified ? null : (kvUnified ?? this.kvUnified),
+      ropeFrequencyBase: clearRopeFrequencyBase
+          ? null
+          : (ropeFrequencyBase ?? this.ropeFrequencyBase),
+      ropeFrequencyScale: clearRopeFrequencyScale
+          ? null
+          : (ropeFrequencyScale ?? this.ropeFrequencyScale),
     );
   }
 }