Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@
* **Native runtime sync**:
* Updated native hook pinning to `leehack/llamadart-native@b9016`,
picking up the CUDA 12.8 Blackwell-capable native bundles.
* **Load-time tuning knobs**:
* Added `ModelParams.useMmap` (default `true`) and
`ModelParams.useMlock` (default `false`), wired to
`llama_model_params.use_mmap` / `use_mlock`. Lets callers turn off mmap
for platforms where memory-mapped weights hurt throughput, or pin
weights in RAM to avoid first-token paging spikes.
* Added `ModelParams.flashAttention` with the `FlashAttention.{auto,
enabled, disabled}` enum, wired to
`llama_context_params.flash_attn_type`. Explicit settings win over the
existing automatic Android/Vulkan heuristics; `auto` preserves prior
behavior.
* Added `ModelParams.cacheTypeK` and `ModelParams.cacheTypeV` with the
`KvCacheType.{f16, q8_0, q4_0}` enum, wired to
`llama_context_params.type_k` / `type_v`. Enables KV-cache
quantization (Q8_0 ≈ halves KV memory; Q4_0 ≈ quarters it). When the
user requests a non-F16 KV type with `flashAttention: auto`, the
service auto-promotes flash attention to enabled — llama.cpp requires
it for KV quantization.
* Added `ModelParams.kvUnified` (nullable) for explicit override of
`llama_context_params.kv_unified`. `null` keeps the existing
auto-enable-when-multi-sequence behavior.
* Added `ModelParams.ropeFrequencyBase` and
`ModelParams.ropeFrequencyScale` (both nullable) for
context-extension overrides on `llama_context_params.rope_freq_base` /
`rope_freq_scale`. `null` keeps the model's trained values.
* **GPU device selection API**:
* Added `ModelParams.mainGpu` and wired it to llama.cpp
`llama_model_params.main_gpu`.
Expand Down
2 changes: 2 additions & 0 deletions lib/llamadart.dart
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ export 'src/core/models/tools/tool_params.dart';
export 'src/core/llama_logger.dart';
export 'src/core/models/config/log_level.dart';
export 'src/core/models/config/gpu_backend.dart';
export 'src/core/models/config/flash_attention.dart';
export 'src/core/models/config/kv_cache_type.dart';
export 'src/core/models/config/lora_config.dart';

// Utils
Expand Down
13 changes: 12 additions & 1 deletion lib/src/backends/llama_cpp/llama_cpp_service.dart
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ import 'dart:math' as math;
import 'package:ffi/ffi.dart';
import 'package:path/path.dart' as path;

import '../../core/llama_logger.dart';
import '../../core/models/chat/content_part.dart';
import '../../core/models/config/gpu_backend.dart';
import '../../core/models/config/log_level.dart';
import '../../core/models/inference/generation_params.dart';
import '../../core/models/inference/model_params.dart';
import 'load_param_helpers.dart';
import 'bindings.dart';

typedef _GgmlBackendLoadNative = ggml_backend_reg_t Function(Pointer<Char>);
Expand Down Expand Up @@ -1267,7 +1269,7 @@ class LlamaCppService {
mparams.n_gpu_layers = gpuLayers;
mparams.split_modeAsInt = modelParams.splitMode.llamaCppValue;
mparams.main_gpu = modelParams.mainGpu;
mparams.use_mmap = true;
applyModelParams(mparams, modelParams);
if (preferredDevices != null) {
mparams.devices = preferredDevices;
}
Expand Down Expand Up @@ -2524,6 +2526,15 @@ class LlamaCppService {
}
}

params.validate();
final resolvedFlashAttn = applyContextParams(ctxParams, params);
if (resolvedFlashAttn != params.flashAttention) {
LlamaLogger.instance.debug(
'llama_cpp_service: promoting flash_attn=enabled for non-F16 KV '
'(k=${params.cacheTypeK}, v=${params.cacheTypeV})',
);
}

final ctxPtr = llama_init_from_model(model.pointer, ctxParams);
if (ctxPtr == nullptr) {
throw Exception("Failed to create context");
Expand Down
89 changes: 89 additions & 0 deletions lib/src/backends/llama_cpp/load_param_helpers.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Pure helpers for the load path. Kept here (vs inlined in the service) so
// they can be unit-tested without going through `LlamaEngine.loadModel`,
// which is integration-level and needs a real model file.

import '../../core/models/config/flash_attention.dart';
import '../../core/models/config/kv_cache_type.dart';
import '../../core/models/inference/model_params.dart';
import 'bindings.dart';

/// Translates a [KvCacheType] into the corresponding llama.cpp `ggml_type`.
///
/// The mapping is one-to-one and has no side effects.
ggml_type ggmlTypeFor(KvCacheType type) => switch (type) {
  KvCacheType.f16 => ggml_type.GGML_TYPE_F16,
  KvCacheType.q8_0 => ggml_type.GGML_TYPE_Q8_0,
  KvCacheType.q4_0 => ggml_type.GGML_TYPE_Q4_0,
};

/// Decides which [FlashAttention] mode to apply for the given KV cache types.
///
/// An explicit [FlashAttention.enabled] or [FlashAttention.disabled] request
/// is returned unchanged. A [FlashAttention.auto] request is promoted to
/// [FlashAttention.enabled] when [cacheTypeK] or [cacheTypeV] is not
/// [KvCacheType.f16], because a quantized KV cache needs flash attention;
/// otherwise `auto` is returned as-is.
///
/// The `(non-F16 KV, disabled)` combination is not rejected here; that check
/// lives in [ModelParams.validate], which throws an [ArgumentError] for it.
FlashAttention resolveFlashAttention({
  required FlashAttention requested,
  required KvCacheType cacheTypeK,
  required KvCacheType cacheTypeV,
}) {
  // Explicit choices are never overridden.
  if (requested != FlashAttention.auto) {
    return requested;
  }
  final bothF16 =
      cacheTypeK == KvCacheType.f16 && cacheTypeV == KvCacheType.f16;
  return bothF16 ? FlashAttention.auto : FlashAttention.enabled;
}

/// Copies the user-controlled load flags from [params] onto [mparams].
///
/// Writes `use_mmap` and `use_mlock` in place and touches nothing else. The
/// struct is neither allocated nor freed here; the caller remains responsible
/// for initialising it beforehand and releasing it afterwards.
void applyModelParams(llama_model_params mparams, ModelParams params) {
  mparams
    ..use_mmap = params.useMmap
    ..use_mlock = params.useMlock;
}

/// Writes the user-controlled context settings from [params] into
/// [ctxParams].
///
/// The flash-attention mode is first resolved through
/// [resolveFlashAttention], so an `auto` request combined with a non-F16 KV
/// cache is applied as `enabled`. When the resolved mode is still `auto`,
/// `flash_attn_type` is left exactly as the caller set it. The KV cache types
/// are always written; `kv_unified`, `rope_freq_base` and `rope_freq_scale`
/// are only written when the matching [ModelParams] field is non-null.
///
/// Returns the resolved [FlashAttention] so the caller can compare it with
/// the requested value and log a promotion.
FlashAttention applyContextParams(
  llama_context_params ctxParams,
  ModelParams params,
) {
  final keyType = params.cacheTypeK;
  final valueType = params.cacheTypeV;
  final flashAttn = resolveFlashAttention(
    requested: params.flashAttention,
    cacheTypeK: keyType,
    cacheTypeV: valueType,
  );

  // `auto` falls through both branches: whatever value the struct already
  // carries for flash_attn_type is kept.
  if (flashAttn == FlashAttention.enabled) {
    ctxParams.flash_attn_typeAsInt =
        llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED.value;
  } else if (flashAttn == FlashAttention.disabled) {
    ctxParams.flash_attn_typeAsInt =
        llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_DISABLED.value;
  }

  ctxParams.type_kAsInt = ggmlTypeFor(keyType).value;
  ctxParams.type_vAsInt = ggmlTypeFor(valueType).value;

  // Nullable overrides: a null field means "keep the struct's current value".
  if (params.kvUnified case final unified?) {
    ctxParams.kv_unified = unified;
  }
  if (params.ropeFrequencyBase case final freqBase?) {
    ctxParams.rope_freq_base = freqBase;
  }
  if (params.ropeFrequencyScale case final freqScale?) {
    ctxParams.rope_freq_scale = freqScale;
  }

  return flashAttn;
}
12 changes: 12 additions & 0 deletions lib/src/core/models/config/flash_attention.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/// Selects the value written to llama.cpp's
/// `llama_context_params.flash_attn_type`.
///
/// Flash attention must not be [disabled] when a [KvCacheType] other than
/// [KvCacheType.f16] is requested — llama.cpp refuses non-F16 KV cache
/// without it.
enum FlashAttention {
/// No explicit choice: the load helpers leave `flash_attn_type` untouched,
/// so whatever the service already selected stays in effect. Promoted to
/// [enabled] when a non-F16 KV cache type is requested.
auto,

/// Force flash attention on (`LLAMA_FLASH_ATTN_TYPE_ENABLED`).
enabled,

/// Force flash attention off (`LLAMA_FLASH_ATTN_TYPE_DISABLED`). Rejected by
/// `ModelParams.validate` when combined with a non-F16 KV cache type.
disabled,
}
13 changes: 13 additions & 0 deletions lib/src/core/models/config/kv_cache_type.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/// KV-cache data type for `llama_context_params.type_k` / `type_v`.
///
/// q8_0 uses roughly 0.5× the KV memory of f16; q4_0 roughly 0.25×. Both
/// quantized types require flash attention to be enabled (see
/// [FlashAttention]).
enum KvCacheType {
/// 16-bit float (`GGML_TYPE_F16`). The default for both K and V caches.
f16,

/// 8-bit quantized (`GGML_TYPE_Q8_0`).
q8_0,

/// 4-bit quantized (`GGML_TYPE_Q4_0`).
q4_0,
}
93 changes: 90 additions & 3 deletions lib/src/core/models/inference/model_params.dart
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import '../config/flash_attention.dart';
import '../config/gpu_backend.dart';

import '../config/kv_cache_type.dart';
import '../config/lora_config.dart';

/// Strategy for distributing model tensors across GPU devices.
Expand Down Expand Up @@ -111,10 +112,39 @@ class ModelParams {
/// Set to 1 to preserve single-sequence behavior.
final int maxParallelSequences;

/// `llama_model_params.use_mmap`. Default `true`.
final bool useMmap;

/// `llama_model_params.use_mlock`. Default `false`.
final bool useMlock;

/// `llama_context_params.flash_attn_type`. User-explicit values override
/// the platform/backend heuristic.
final FlashAttention flashAttention;

/// `llama_context_params.type_k`. Non-F16 requires [flashAttention] enabled.
final KvCacheType cacheTypeK;

/// `llama_context_params.type_v`. Non-F16 requires [flashAttention] enabled.
final KvCacheType cacheTypeV;

/// `llama_context_params.kv_unified`. `null` keeps the current heuristic
/// (auto-enabled when [maxParallelSequences] > 1).
final bool? kvUnified;

/// `llama_context_params.rope_freq_base`. `null` keeps the model's
/// trained value.
final double? ropeFrequencyBase;

/// `llama_context_params.rope_freq_scale`. `null` keeps the model's
/// trained value.
final double? ropeFrequencyScale;

/// Maximum number of GPU layers to safely offload all layers.
static const int maxGpuLayers = 999;

/// Creates configuration for the model.
/// Creates configuration for the model. Use [validate] to check for
/// llama.cpp-incompatible combinations before passing to a load call.
const ModelParams({
this.contextSize = 4096,
this.gpuLayers = maxGpuLayers,
Expand All @@ -128,9 +158,40 @@ class ModelParams {
this.batchSize = 0,
this.microBatchSize = 0,
this.maxParallelSequences = 1,
this.useMmap = true,
this.useMlock = false,
this.flashAttention = FlashAttention.auto,
this.cacheTypeK = KvCacheType.f16,
this.cacheTypeV = KvCacheType.f16,
this.kvUnified,
this.ropeFrequencyBase,
this.ropeFrequencyScale,
});

/// Checks that this parameter combination is one llama.cpp can accept.
///
/// Throws an [ArgumentError] when [cacheTypeK] or [cacheTypeV] is not
/// [KvCacheType.f16] while [flashAttention] is [FlashAttention.disabled];
/// that is currently the only rule enforced. Every other combination
/// returns normally.
///
/// `LlamaCppService` invokes this itself before creating the native
/// context, so calling it is optional; it is public so callers can fail
/// fast right after building a [ModelParams].
void validate() {
  final usesF16Only =
      cacheTypeK == KvCacheType.f16 && cacheTypeV == KvCacheType.f16;
  // Only a quantized KV cache with flash attention forced off is invalid.
  if (usesF16Only || flashAttention != FlashAttention.disabled) {
    return;
  }
  throw ArgumentError(
    'Non-F16 KV cache (cacheTypeK=$cacheTypeK, cacheTypeV=$cacheTypeV) '
    'requires flashAttention != disabled. Either set flashAttention to '
    'auto/enabled or use KvCacheType.f16 for both.',
  );
}

/// Creates a copy of this [ModelParams] with updated fields.
///
/// Nullable fields ([chatTemplate], [kvUnified], [ropeFrequencyBase],
/// [ropeFrequencyScale]) use a sentinel pattern so callers can
/// **explicitly clear them back to null** by passing the corresponding
/// `clear*: true` flag. Without the sentinel, `null` would be
/// indistinguishable from "argument omitted, keep current value".
ModelParams copyWith({
int? contextSize,
int? gpuLayers,
Expand All @@ -139,11 +200,23 @@ class ModelParams {
int? mainGpu,
List<LoraAdapterConfig>? loras,
String? chatTemplate,
bool clearChatTemplate = false,
int? numberOfThreads,
int? numberOfThreadsBatch,
int? batchSize,
int? microBatchSize,
int? maxParallelSequences,
bool? useMmap,
bool? useMlock,
FlashAttention? flashAttention,
KvCacheType? cacheTypeK,
KvCacheType? cacheTypeV,
bool? kvUnified,
bool clearKvUnified = false,
double? ropeFrequencyBase,
bool clearRopeFrequencyBase = false,
double? ropeFrequencyScale,
bool clearRopeFrequencyScale = false,
}) {
return ModelParams(
contextSize: contextSize ?? this.contextSize,
Expand All @@ -152,12 +225,26 @@ class ModelParams {
splitMode: splitMode ?? this.splitMode,
mainGpu: mainGpu ?? this.mainGpu,
loras: loras ?? this.loras,
chatTemplate: chatTemplate ?? this.chatTemplate,
chatTemplate: clearChatTemplate
? null
: (chatTemplate ?? this.chatTemplate),
numberOfThreads: numberOfThreads ?? this.numberOfThreads,
numberOfThreadsBatch: numberOfThreadsBatch ?? this.numberOfThreadsBatch,
batchSize: batchSize ?? this.batchSize,
microBatchSize: microBatchSize ?? this.microBatchSize,
maxParallelSequences: maxParallelSequences ?? this.maxParallelSequences,
useMmap: useMmap ?? this.useMmap,
useMlock: useMlock ?? this.useMlock,
flashAttention: flashAttention ?? this.flashAttention,
cacheTypeK: cacheTypeK ?? this.cacheTypeK,
cacheTypeV: cacheTypeV ?? this.cacheTypeV,
kvUnified: clearKvUnified ? null : (kvUnified ?? this.kvUnified),
ropeFrequencyBase: clearRopeFrequencyBase
? null
: (ropeFrequencyBase ?? this.ropeFrequencyBase),
ropeFrequencyScale: clearRopeFrequencyScale
? null
: (ropeFrequencyScale ?? this.ropeFrequencyScale),
);
}
}
Loading
Loading