Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## [Unreleased]

### Changed
- **Server parallel context sizing:** `--ctx-size` is now treated as a total context budget shared across active request slots, matching llama.cpp server semantics. `--parallel N --ctx-size C` yields an effective per-slot window of `floor(C / N)`; explicit `--max-batch-size` values share the same budget, `--no-batch` keeps a single full-context slot, `/slots` reports the per-slot window, startup rejects per-slot windows below 512 tokens, and memory preflight uses the same sizing model (#57).

## [v0.0.29] - 2026-05-20

### Added
Expand Down
16 changes: 16 additions & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ and cached on first use. Set them before starting `mlxcel` or `mlxcel-server`.
| `MLXCEL_SERVER_DECODE_STORAGE` | `auto`, `dense`, `paged` | `auto` | Server continuous-batching decode storage. `--decode-storage-backend` takes precedence. Invalid values warn and fall back to `auto`. |
| `MLXCEL_SURGERY` | YAML file path | unset | Feature-gated weight-load surgery configuration. `--surgery` takes precedence when the `surgery` feature is built. |

## Server context sizing

`mlxcel serve` and `mlxcel-server` follow llama.cpp server semantics for the
llama-compatible flags `--ctx-size` / `LLAMA_ARG_CTX_SIZE` and `--parallel`
(spelled `--n-parallel` in `mlxcel serve`) / `LLAMA_ARG_N_PARALLEL`: an explicit
`--ctx-size C` is a total context budget shared by the active request slots, so
each slot receives `floor(C / N)` tokens when `--parallel N` is used. If
`--max-batch-size M` is set, `M` replaces `N` as the divisor because it controls
the maximum number of concurrent decode sequences. With `--no-batch`, the
divisor is `1`.

Startup fails when the effective per-slot context window is below 512 tokens.
The `/slots` endpoint and the `context_size` field of `/health` report the
effective per-slot window, not the total `--ctx-size` budget. The
`--estimate-memory` preflight uses the same per-slot window and active-sequence
count, so increasing `--parallel` does not multiply KV memory for a fixed
explicit `--ctx-size`.

## Build-time variables

These are read by the `mlxcel-core` build script.
Expand Down
6 changes: 3 additions & 3 deletions src/bin/mlx_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ struct ServerArgs {
#[arg(long, env = "LLAMA_ARG_PORT", default_value_t = 8080)]
port: u16,

/// Context size limit (0 = use model default)
/// Total context budget shared across parallel slots (0 = use model default)
#[arg(
short = 'c',
long = "ctx-size",
Expand All @@ -182,7 +182,7 @@ struct ServerArgs {
)]
predict: i32,

/// Number of parallel request slots
/// Number of parallel request slots that share --ctx-size
#[arg(long = "parallel", env = "LLAMA_ARG_N_PARALLEL", default_value_t = 1)]
parallel: usize,

Expand Down Expand Up @@ -215,7 +215,7 @@ struct ServerArgs {
)]
draft: usize,

/// Maximum number of concurrent decode sequences (default: --parallel value)
/// Maximum concurrent decode sequences; explicit value shares --ctx-size
#[arg(long = "max-batch-size", value_name = "N")]
max_batch_size: Option<usize>,

Expand Down
16 changes: 12 additions & 4 deletions src/commands/serve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ use mlxcel::server::{
env_fallback_lang_bias, env_fallback_lang_bias_include_byte_fragments,
env_fallback_prompt_cache_capacity_bytes, env_fallback_prompt_cache_enabled,
env_fallback_prompt_cache_max_entries, env_fallback_prompt_cache_min_prefix,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, start_server,
env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, resolve_parallel_context_size,
start_server,
};
use mlxcel_core::cache::KVCacheMode;

Expand Down Expand Up @@ -86,7 +87,7 @@ fn run_serve_memory_preflight(args: &crate::ServeArgs) -> anyhow::Result<()> {
return Err(anyhow::anyhow!(
"--estimate-memory: total {} exceeds available {} by {}. \
Pass --force (or --no-memory-check) to override, or rerun with \
a smaller --ctx-size / a smaller model.",
a smaller --ctx-size, smaller --max-batch-size, or a smaller model.",
format_bytes(estimate.total_bytes),
format_bytes(estimate.available_bytes),
format_bytes(estimate.overflow_bytes()),
Expand All @@ -100,9 +101,16 @@ fn run_serve_memory_preflight(args: &crate::ServeArgs) -> anyhow::Result<()> {
fn serve_preflight_ctx_len(args: &crate::ServeArgs) -> u64 {
// `--ctx-size 0` is the "use model default" sentinel; in that case we
// fall back to 8192 to match the historical sizing used by
// `--recommend-quant`. `--max-kv-size` caps the plain KV cache length.
// `--recommend-quant`. Explicit `--ctx-size` is a total budget shared by
// active slots, matching llama.cpp server semantics. `--max-kv-size`
// caps the plain KV cache length after the per-slot window is resolved.
let mut ctx_len = if args.ctx_size > 0 {
args.ctx_size as u64
resolve_parallel_context_size(
args.ctx_size,
args.n_parallel,
args.max_batch_size,
args.no_batch,
) as u64
} else {
mlxcel::memory_estimate::DEFAULT_CTX_LEN
};
Expand Down
18 changes: 18 additions & 0 deletions src/commands/serve_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,29 @@ fn serve_preflight_ctx_len_uses_default_and_max_kv_cap() {
mlxcel::memory_estimate::DEFAULT_CTX_LEN
);

args.ctx_size = 8192;
assert_eq!(serve_preflight_ctx_len(&args), 2048);

args.ctx_size = 8192;
args.max_kv_size = 2048;
assert_eq!(serve_preflight_ctx_len(&args), 2048);
}

// Checks that the memory preflight divides an explicit `--ctx-size` across the
// active slots instead of charging the full budget to every slot.
// NOTE(review): relies on `sample_args()` leaving `no_batch` false and
// `max_kv_size` at a value that does not cap these results — confirm.
#[test]
fn serve_preflight_ctx_len_uses_parallel_context_semantics() {
let mut args = sample_args();
args.ctx_size = 4096;
args.max_batch_size = None;
args.n_parallel = 4;
// No explicit batch size: the four parallel slots share the budget (4096 / 4).
assert_eq!(serve_preflight_ctx_len(&args), 1024);

// An explicit max batch size replaces `n_parallel` as the divisor (4096 / 2).
args.max_batch_size = Some(2);
assert_eq!(serve_preflight_ctx_len(&args), 2048);

// `no_batch` collapses to one slot that keeps the whole budget, even though
// `max_batch_size` is still `Some(2)` from the previous step.
args.no_batch = true;
assert_eq!(serve_preflight_ctx_len(&args), 4096);
}

#[test]
fn build_startup_input_propagates_decode_storage_backend() {
let mut args = sample_args();
Expand Down
6 changes: 3 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,11 +455,11 @@ pub(crate) struct ServeArgs {
#[arg(long, value_name = "PATH")]
api_key_file: Option<PathBuf>,

/// Number of parallel request slots
/// Number of parallel request slots that share --ctx-size
#[arg(long, env = "LLAMA_ARG_N_PARALLEL", default_value_t = 1)]
n_parallel: usize,

/// Context size limit (0 = use model default)
/// Total context budget shared across parallel slots (0 = use model default)
#[arg(long, env = "LLAMA_ARG_CTX_SIZE", default_value_t = 0)]
ctx_size: usize,

Expand All @@ -475,7 +475,7 @@ pub(crate) struct ServeArgs {
#[arg(long, env = "LLAMA_ARG_DRAFT_MAX", default_value_t = 16)]
draft_max: usize,

/// Maximum number of concurrent decode sequences (default: --n-parallel value)
/// Maximum concurrent decode sequences; explicit value shares --ctx-size
#[arg(long, value_name = "N")]
max_batch_size: Option<usize>,

Expand Down
15 changes: 11 additions & 4 deletions src/server/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,12 @@ pub struct ServerConfig {
pub api_key: Option<String>,
pub timeout_seconds: u64,
pub model_alias: Option<String>,
/// Effective per-slot context window in tokens (`0` = model default).
///
/// Startup lowers `--ctx-size C --parallel N` to `C / N` for continuous
/// batching, matching llama.cpp server semantics. An explicit
/// `--max-batch-size` override becomes the divisor because it controls the
/// maximum number of concurrent decode sequences.
pub context_size: usize,
pub n_parallel: usize,
pub enable_slots_endpoint: bool,
Expand Down Expand Up @@ -384,12 +390,13 @@ pub struct ServerConfig {
/// both the legacy `kv_cache_mode` flag and the per-layer modes
/// resolved from `batch_kv_quant`.
///
/// Resolved from `--max-kv-size` CLI flag and `LLAMA_ARG_MAX_KV_SIZE`
/// env var, then validated by
/// Resolved from the effective per-slot `--ctx-size` and the
/// `--max-kv-size` CLI flag / `LLAMA_ARG_MAX_KV_SIZE` env var. The
/// explicit max-KV value is validated by
/// [`crate::server::cli_input::resolve_max_kv_size`] against the
/// accepted range (`0` = disabled, or
/// `[MAX_KV_SIZE_MIN, i32::MAX]`). `None` (the default) preserves
/// the legacy unbounded behaviour.
/// `[MAX_KV_SIZE_MIN, i32::MAX]`). If both are present, the lower value
/// wins so the configured context window remains an upper bound.
pub max_kv_size: Option<usize>,
}

Expand Down
5 changes: 4 additions & 1 deletion src/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,8 @@ pub use prompt_cache::{
multimodal_digest_from_vecs,
};
pub use speculative_dispatch::{SpeculativeDispatch, SpeculativeDispatchError};
pub use startup::{ServerStartupConfig, start_server};
pub use startup::{
MIN_PARALLEL_CONTEXT_SIZE, ServerStartupConfig, effective_parallel_context_slots,
resolve_parallel_context_size, start_server,
};
pub use state::{AppState, BatchMetrics, Metrics, ModelMediaSupport};
2 changes: 1 addition & 1 deletion src/server/routes/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::server::types::{BatchStatusInfo, HealthResponse};
/// Build model-level health fields once the model is confirmed loaded.
///
/// Returns `(context_size, tool_call_parser)`:
/// - `context_size`: the configured `--ctx-size` value (0 = model default).
/// - `context_size`: the effective per-slot context window (0 = model default).
/// - `tool_call_parser`: `Some("mlxcel")` when the chat template supports
/// tool calls; `None` when the template does not expose the `tools`
/// variable and tool-call parsing will therefore never activate.
Expand Down
3 changes: 3 additions & 0 deletions src/server/routes/slots.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
let active_count = state.batch_metrics.active_count();
let queue_depth = state.batch_metrics.queue_depth();
let model_id = state.display_model_id().to_string();
let context_size = state.config.context_size;

let mut slots: Vec<SlotInfo> = Vec::with_capacity(max_slots + queue_depth);

Expand All @@ -45,6 +46,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
"idle".to_string()
},
model: model_id.clone(),
context_size,
is_processing: is_active,
prompt_tokens: None,
generated_tokens: None,
Expand All @@ -58,6 +60,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
id: max_slots + i,
state: "queued".to_string(),
model: model_id.clone(),
context_size,
is_processing: false,
prompt_tokens: None,
generated_tokens: None,
Expand Down
106 changes: 100 additions & 6 deletions src/server/startup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ struct ResolvedDistributedStartup {
remote_stage_service: Option<RemoteStageServiceConfig>,
}

/// Minimum effective context window, in tokens, accepted for each request slot.
///
/// `llama-server` treats `--ctx-size` as a total context budget shared by
/// parallel slots. Below this floor, a process can start successfully but
/// become unusable for normal chat/completion traffic, so fail early with a
/// clear operator-facing error.
///
/// Enforced by `validate_parallel_context_startup`, which skips the check when
/// the configured `ctx_size` is `0`.
pub const MIN_PARALLEL_CONTEXT_SIZE: usize = 512;

/// Startup configuration for the server (shared between `mlxcel serve` and `mlxcel-server`).
#[derive(Debug)]
pub struct ServerStartupConfig {
Expand Down Expand Up @@ -433,6 +441,82 @@ impl Default for ServerStartupConfig {
}
}

/// Count the request slots that divide the total context budget between them.
///
/// * With `no_batch` set, the answer is always `1`: the sequential worker
///   serves one request at a time, so that single slot keeps the whole budget.
/// * Otherwise an explicit `max_batch_size` is used as the slot count, falling
///   back to `n_parallel` when no override was given.
///
/// The result is never `0`, so callers can divide by it safely.
pub fn effective_parallel_context_slots(
    n_parallel: usize,
    max_batch_size: Option<usize>,
    no_batch: bool,
) -> usize {
    if no_batch {
        return 1;
    }

    let requested_slots = match max_batch_size {
        Some(explicit) => explicit,
        None => n_parallel,
    };

    // A zero slot count would make the per-slot division undefined; clamp it.
    std::cmp::max(requested_slots, 1)
}

/// Turn a total context budget into the window each slot actually receives.
///
/// A `ctx_size` of `0` is passed through unchanged. Any other budget is
/// integer-divided (rounding down) by the slot count reported by
/// [`effective_parallel_context_slots`] for the same `n_parallel`,
/// `max_batch_size`, and `no_batch` settings.
pub fn resolve_parallel_context_size(
    ctx_size: usize,
    n_parallel: usize,
    max_batch_size: Option<usize>,
    no_batch: bool,
) -> usize {
    match ctx_size {
        0 => 0,
        total_budget => {
            let sharing_slots =
                effective_parallel_context_slots(n_parallel, max_batch_size, no_batch);
            total_budget / sharing_slots
        }
    }
}

/// Combine the per-slot context window with an optional explicit KV cap.
///
/// * A per-slot window of `0` contributes no bound, so the explicit cap (if
///   any) is returned untouched.
/// * A non-zero window with no explicit cap becomes the cap itself.
/// * When both are present, the smaller of the two is returned.
fn resolve_context_kv_cap(
    per_slot_context_size: usize,
    explicit_max_kv_size: Option<usize>,
) -> Option<usize> {
    match (per_slot_context_size, explicit_max_kv_size) {
        (0, explicit) => explicit,
        (window, None) => Some(window),
        (window, Some(explicit)) => Some(std::cmp::min(explicit, window)),
    }
}

/// Reject startup configurations whose per-slot context window is too small.
///
/// A `ctx_size` of `0` is accepted without further checks. Otherwise the
/// budget is resolved to a per-slot window and compared against
/// [`MIN_PARALLEL_CONTEXT_SIZE`].
///
/// # Errors
///
/// Returns an error naming the budget, the slot count, the resulting per-slot
/// window, and the minimum when the window falls below the floor.
fn validate_parallel_context_startup(startup: &ServerStartupConfig) -> Result<()> {
    let total_budget = startup.ctx_size;

    if total_budget != 0 {
        let per_slot_window = resolve_parallel_context_size(
            total_budget,
            startup.n_parallel,
            startup.max_batch_size,
            startup.no_batch,
        );
        // Only needed for the error message; computed from the same inputs so
        // the reported divisor matches the one used above.
        let sharing_slots = effective_parallel_context_slots(
            startup.n_parallel,
            startup.max_batch_size,
            startup.no_batch,
        );

        anyhow::ensure!(
            per_slot_window >= MIN_PARALLEL_CONTEXT_SIZE,
            "--ctx-size {} divided across {} active slot(s) gives {} tokens per slot, below the minimum supported per-slot context size of {}; increase --ctx-size, reduce --parallel/--max-batch-size, or use --no-batch for single-slot serving",
            total_budget,
            sharing_slots,
            per_slot_window,
            MIN_PARALLEL_CONTEXT_SIZE
        );
    }

    Ok(())
}

/// Resolve the elastic repartition configuration from CLI flags.
///
/// Returns `None` when `--enable-elastic-pp` is not set, which is the
Expand Down Expand Up @@ -638,12 +722,20 @@ pub(super) fn build_server_config(
&startup.tp_lm_head_mode,
)
.expect("tensor parallel config was already validated during startup");
let max_batch_size = startup.max_batch_size.unwrap_or(startup.n_parallel).max(1);
let context_size = resolve_parallel_context_size(
startup.ctx_size,
startup.n_parallel,
startup.max_batch_size,
startup.no_batch,
);
let max_kv_size = resolve_context_kv_cap(context_size, startup.max_kv_size);

ServerConfig {
api_key,
timeout_seconds: startup.timeout,
model_alias: startup.model_alias.clone(),
context_size: startup.ctx_size,
context_size,
n_parallel: startup.n_parallel,
enable_slots_endpoint: startup.enable_slots,
enable_props_endpoint: startup.enable_props,
Expand Down Expand Up @@ -672,7 +764,7 @@ pub(super) fn build_server_config(
// the resolved kind are known.
draft_kind: startup.draft_kind.clone(),
draft_block_size: startup.draft_block_size,
max_batch_size: startup.max_batch_size.unwrap_or(startup.n_parallel).max(1),
max_batch_size,
max_queue_depth: startup.max_queue_depth,
prefill_chunk_size: startup.prefill_chunk_size,
enable_preemption: startup.enable_preemption,
Expand Down Expand Up @@ -704,10 +796,11 @@ pub(super) fn build_server_config(
// the continuous-batching scheduler can apply per-layer modes
// (with the last-layer skip) at sequence allocation time.
batch_kv_quant: startup.batch_kv_quant,
// Issue #603: forward the resolved `--max-kv-size` so the scheduler
// can apply a head-trim policy to plain `KVCache` instances. `None`
// disables the cap and preserves the legacy unbounded behaviour.
max_kv_size: startup.max_kv_size,
// Issue #57/#603: forward the resolved per-slot context cap (optionally
// tightened by `--max-kv-size`) so the scheduler can apply a head-trim
// policy to plain `KVCache` instances. `None` means no explicit
// context or max-KV bound was configured.
max_kv_size,
}
}

Expand Down Expand Up @@ -1297,6 +1390,7 @@ pub async fn start_server(mut startup: ServerStartupConfig) -> Result<()> {
anyhow::bail!("--dry-run was requested but --pp-auto was not provided; nothing to plan");
}

validate_parallel_context_startup(&startup)?;
validate_pipeline_parallel_startup(&startup)?;
let tp_support = resolve_tensor_parallel_runtime_support(&startup)?;

Expand Down
Loading