lablup · inureyes · May 21, 2026 · May 21, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ## [Unreleased]
 
+### Changed
+- **Server parallel context sizing:** `--ctx-size` is now treated as a total context budget shared across active request slots, matching llama.cpp server semantics. `--parallel N --ctx-size C` yields an effective per-slot window of `floor(C / N)`; an explicit `--max-batch-size M` replaces `N` as the divisor, `--no-batch` keeps a single full-context slot, `/slots` and `/health` report the per-slot window, startup rejects per-slot windows below 512 tokens, and memory preflight uses the same sizing model (#57).
+
 ## [v0.0.29] - 2026-05-20
 
 ### Added

diff --git a/docs/environment-variables.md b/docs/environment-variables.md
@@ -37,6 +37,22 @@ and cached on first use. Set them before starting `mlxcel` or `mlxcel-server`.
 | `MLXCEL_SERVER_DECODE_STORAGE` | `auto`, `dense`, `paged` | `auto` | Server continuous-batching decode storage. `--decode-storage-backend` takes precedence. Invalid values warn and fall back to `auto`. |
 | `MLXCEL_SURGERY` | YAML file path | unset | Feature-gated weight-load surgery configuration. `--surgery` takes precedence when the `surgery` feature is built. |
 
+## Server context sizing
+
+`mlxcel serve` and `mlxcel-server` follow llama.cpp server semantics for the
+llama-compatible flags `--ctx-size` / `LLAMA_ARG_CTX_SIZE` and `--parallel`
+(`--n-parallel` in `mlxcel serve`) / `LLAMA_ARG_N_PARALLEL`: an explicit `--ctx-size C`
+is a total context budget shared by the active request slots, so each slot receives
+`floor(C / N)` tokens with `N` parallel slots. If `--max-batch-size M` is set, `M`
+replaces `N` as the divisor because it controls the maximum number of concurrent
+decode sequences. With `--no-batch`, the divisor is `1`.
+
+Startup fails when an explicit `--ctx-size` yields a per-slot window below 512 tokens;
+`--ctx-size 0` (model default) is neither divided nor checked. The `/slots` endpoint and
+`/health.context_size` report the effective per-slot window, not the total `--ctx-size`
+budget. The `--estimate-memory` preflight uses the same per-slot window and
+active-sequence count, so raising `--parallel` does not multiply KV memory for a fixed `--ctx-size`.
+
 ## Build-time variables
 
 These are read by the `mlxcel-core` build script.

diff --git a/src/bin/mlx_server.rs b/src/bin/mlx_server.rs
@@ -163,7 +163,7 @@ struct ServerArgs {
     #[arg(long, env = "LLAMA_ARG_PORT", default_value_t = 8080)]
     port: u16,
 
-    /// Context size limit (0 = use model default)
+    /// Total context budget shared across parallel slots (0 = use model default)
     #[arg(
         short = 'c',
         long = "ctx-size",
@@ -182,7 +182,7 @@ struct ServerArgs {
     )]
     predict: i32,
 
-    /// Number of parallel request slots
+    /// Number of parallel request slots that share --ctx-size
     #[arg(long = "parallel", env = "LLAMA_ARG_N_PARALLEL", default_value_t = 1)]
     parallel: usize,
 
@@ -215,7 +215,7 @@ struct ServerArgs {
     )]
     draft: usize,
 
-    /// Maximum number of concurrent decode sequences (default: --parallel value)
+    /// Maximum concurrent decode sequences; explicit value shares --ctx-size
     #[arg(long = "max-batch-size", value_name = "N")]
     max_batch_size: Option<usize>,
 

diff --git a/src/commands/serve.rs b/src/commands/serve.rs
@@ -30,7 +30,8 @@ use mlxcel::server::{
     env_fallback_lang_bias, env_fallback_lang_bias_include_byte_fragments,
     env_fallback_prompt_cache_capacity_bytes, env_fallback_prompt_cache_enabled,
     env_fallback_prompt_cache_max_entries, env_fallback_prompt_cache_min_prefix,
-    env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, start_server,
+    env_fallback_prompt_cache_ttl, env_fallback_reasoning_budget, resolve_parallel_context_size,
+    start_server,
 };
 use mlxcel_core::cache::KVCacheMode;
 
@@ -86,7 +87,7 @@ fn run_serve_memory_preflight(args: &crate::ServeArgs) -> anyhow::Result<()> {
             return Err(anyhow::anyhow!(
                 "--estimate-memory: total {} exceeds available {} by {}. \
                  Pass --force (or --no-memory-check) to override, or rerun with \
-                 a smaller --ctx-size / a smaller model.",
+                 a smaller --ctx-size, smaller --max-batch-size, or a smaller model.",
                 format_bytes(estimate.total_bytes),
                 format_bytes(estimate.available_bytes),
                 format_bytes(estimate.overflow_bytes()),
@@ -100,9 +101,16 @@ fn run_serve_memory_preflight(args: &crate::ServeArgs) -> anyhow::Result<()> {
 fn serve_preflight_ctx_len(args: &crate::ServeArgs) -> u64 {
     // `--ctx-size 0` is the "use model default" sentinel; in that case we
     // fall back to 8192 to match the historical sizing used by
-    // `--recommend-quant`. `--max-kv-size` caps the plain KV cache length.
+    // `--recommend-quant`. Explicit `--ctx-size` is a total budget shared by
+    // active slots, matching llama.cpp server semantics. `--max-kv-size`
+    // caps the plain KV cache length after the per-slot window is resolved.
     let mut ctx_len = if args.ctx_size > 0 {
-        args.ctx_size as u64
+        resolve_parallel_context_size(
+            args.ctx_size,
+            args.n_parallel,
+            args.max_batch_size,
+            args.no_batch,
+        ) as u64
     } else {
         mlxcel::memory_estimate::DEFAULT_CTX_LEN
     };

diff --git a/src/commands/serve_tests.rs b/src/commands/serve_tests.rs
@@ -177,11 +177,29 @@ fn serve_preflight_ctx_len_uses_default_and_max_kv_cap() {
         mlxcel::memory_estimate::DEFAULT_CTX_LEN
     );
 
+    args.ctx_size = 8192;
+    assert_eq!(serve_preflight_ctx_len(&args), 2048);
+
     args.ctx_size = 8192;
     args.max_kv_size = 2048;
     assert_eq!(serve_preflight_ctx_len(&args), 2048);
 }
 
+#[test]
+fn serve_preflight_ctx_len_uses_parallel_context_semantics() {
+    let mut args = sample_args();
+    args.ctx_size = 4096;
+    args.max_batch_size = None;
+    args.n_parallel = 4;
+    assert_eq!(serve_preflight_ctx_len(&args), 1024); // 4096 split across 4 parallel slots
+
+    args.max_batch_size = Some(2);
+    assert_eq!(serve_preflight_ctx_len(&args), 2048); // explicit batch size overrides n_parallel as divisor
+
+    args.no_batch = true;
+    assert_eq!(serve_preflight_ctx_len(&args), 4096); // no_batch: one slot keeps the whole budget
+}
+
 #[test]
 fn build_startup_input_propagates_decode_storage_backend() {
     let mut args = sample_args();

diff --git a/src/main.rs b/src/main.rs
@@ -455,11 +455,11 @@ pub(crate) struct ServeArgs {
     #[arg(long, value_name = "PATH")]
     api_key_file: Option<PathBuf>,
 
-    /// Number of parallel request slots
+    /// Number of parallel request slots that share --ctx-size
     #[arg(long, env = "LLAMA_ARG_N_PARALLEL", default_value_t = 1)]
     n_parallel: usize,
 
-    /// Context size limit (0 = use model default)
+    /// Total context budget shared across parallel slots (0 = use model default)
     #[arg(long, env = "LLAMA_ARG_CTX_SIZE", default_value_t = 0)]
     ctx_size: usize,
 
@@ -475,7 +475,7 @@ pub(crate) struct ServeArgs {
     #[arg(long, env = "LLAMA_ARG_DRAFT_MAX", default_value_t = 16)]
     draft_max: usize,
 
-    /// Maximum number of concurrent decode sequences (default: --n-parallel value)
+    /// Maximum concurrent decode sequences; explicit value shares --ctx-size
     #[arg(long, value_name = "N")]
     max_batch_size: Option<usize>,
 

diff --git a/src/server/config.rs b/src/server/config.rs
@@ -221,6 +221,12 @@ pub struct ServerConfig {
     pub api_key: Option<String>,
     pub timeout_seconds: u64,
     pub model_alias: Option<String>,
+    /// Effective per-slot context window in tokens (`0` = model default).
+    ///
+    /// Startup lowers `--ctx-size C --parallel N` to `C / N` for continuous
+    /// batching, matching llama.cpp server semantics. An explicit
+    /// `--max-batch-size` override becomes the divisor because it controls the
+    /// maximum number of concurrent decode sequences.
     pub context_size: usize,
     pub n_parallel: usize,
     pub enable_slots_endpoint: bool,
@@ -384,12 +390,13 @@ pub struct ServerConfig {
     /// both the legacy `kv_cache_mode` flag and the per-layer modes
     /// resolved from `batch_kv_quant`.
     ///
-    /// Resolved from `--max-kv-size` CLI flag and `LLAMA_ARG_MAX_KV_SIZE`
-    /// env var, then validated by
+    /// Resolved from the effective per-slot `--ctx-size` and the
+    /// `--max-kv-size` CLI flag / `LLAMA_ARG_MAX_KV_SIZE` env var. The
+    /// explicit max-KV value is validated by
     /// [`crate::server::cli_input::resolve_max_kv_size`] against the
     /// accepted range (`0` = disabled, or
-    /// `[MAX_KV_SIZE_MIN, i32::MAX]`). `None` (the default) preserves
-    /// the legacy unbounded behaviour.
+    /// `[MAX_KV_SIZE_MIN, i32::MAX]`). If both are present, the lower value
+    /// wins so the configured context window remains an upper bound.
     pub max_kv_size: Option<usize>,
 }
 

diff --git a/src/server/mod.rs b/src/server/mod.rs
@@ -72,5 +72,8 @@ pub use prompt_cache::{
     multimodal_digest_from_vecs,
 };
 pub use speculative_dispatch::{SpeculativeDispatch, SpeculativeDispatchError};
-pub use startup::{ServerStartupConfig, start_server};
+pub use startup::{
+    MIN_PARALLEL_CONTEXT_SIZE, ServerStartupConfig, effective_parallel_context_slots,
+    resolve_parallel_context_size, start_server,
+};
 pub use state::{AppState, BatchMetrics, Metrics, ModelMediaSupport};
diff --git a/src/server/routes/health.rs b/src/server/routes/health.rs
@@ -25,7 +25,7 @@ use crate::server::types::{BatchStatusInfo, HealthResponse};
 /// Build model-level health fields once the model is confirmed loaded.
 ///
 /// Returns `(context_size, tool_call_parser)`:
-/// - `context_size`: the configured `--ctx-size` value (0 = model default).
+/// - `context_size`: the effective per-slot context window (0 = model default).
 /// - `tool_call_parser`: `Some("mlxcel")` when the chat template supports
 ///   tool calls; `None` when the template does not expose the `tools`
 ///   variable and tool-call parsing will therefore never activate.

diff --git a/src/server/routes/slots.rs b/src/server/routes/slots.rs
@@ -31,6 +31,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
     let active_count = state.batch_metrics.active_count();
     let queue_depth = state.batch_metrics.queue_depth();
     let model_id = state.display_model_id().to_string();
+    let context_size = state.config.context_size;
 
     let mut slots: Vec<SlotInfo> = Vec::with_capacity(max_slots + queue_depth);
 
@@ -45,6 +46,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
                 "idle".to_string()
             },
             model: model_id.clone(),
+            context_size,
             is_processing: is_active,
             prompt_tokens: None,
             generated_tokens: None,
@@ -58,6 +60,7 @@ pub async fn slots(State(state): State<AppState>) -> Json<Vec<SlotInfo>> {
             id: max_slots + i,
             state: "queued".to_string(),
             model: model_id.clone(),
+            context_size,
             is_processing: false,
             prompt_tokens: None,
             generated_tokens: None,

diff --git a/src/server/startup.rs b/src/server/startup.rs
@@ -49,6 +49,14 @@ struct ResolvedDistributedStartup {
     remote_stage_service: Option<RemoteStageServiceConfig>,
 }
 
+/// Minimum effective context window accepted for each request slot.
+///
+/// `llama-server` treats `--ctx-size` as a total context budget shared by
+/// parallel slots. Below this floor, a process can start successfully but
+/// become unusable for normal chat/completion traffic, so fail early with a
+/// clear operator-facing error.
+pub const MIN_PARALLEL_CONTEXT_SIZE: usize = 512;
+
 /// Startup configuration for the server (shared between `mlxcel serve` and `mlxcel-server`).
 #[derive(Debug)]
 pub struct ServerStartupConfig {
@@ -433,6 +441,82 @@ impl Default for ServerStartupConfig {
     }
 }
 
+/// Return the number of slots that share the total context budget.
+///
+/// With `no_batch` the result is always 1: the legacy sequential worker
+/// processes one request at a time and keeps the full budget. Otherwise the
+/// divisor is the explicit `max_batch_size` override, falling back to
+/// `n_parallel`; either is clamped to at least 1, so the result is never 0.
+pub fn effective_parallel_context_slots(
+    n_parallel: usize,
+    max_batch_size: Option<usize>,
+    no_batch: bool,
+) -> usize {
+    if no_batch {
+        1
+    } else {
+        max_batch_size.unwrap_or(n_parallel).max(1)
+    }
+}
+
+/// Resolve the per-slot context window, `floor(ctx_size / slots)`; `0` (model default) stays `0`.
+pub fn resolve_parallel_context_size(
+    ctx_size: usize,
+    n_parallel: usize,
+    max_batch_size: Option<usize>,
+    no_batch: bool,
+) -> usize {
+    if ctx_size == 0 {
+        return 0; // sentinel passes through untouched: there is no budget to divide
+    }
+
+    let slots = effective_parallel_context_slots(n_parallel, max_batch_size, no_batch);
+    ctx_size / slots // slots >= 1; truncating, so a non-zero ctx_size < slots also yields 0
+}
+
+fn resolve_context_kv_cap(
+    per_slot_context_size: usize,
+    explicit_max_kv_size: Option<usize>,
+) -> Option<usize> { // KV cap = the smaller of the per-slot window and the explicit cap
+    if per_slot_context_size == 0 {
+        return explicit_max_kv_size; // no context bound: only the explicit cap (if any) applies
+    }
+
+    Some(match explicit_max_kv_size { // NOTE(review): assumes a disabled cap arrives as None, not Some(0) — confirm
+        Some(max_kv_size) => max_kv_size.min(per_slot_context_size), // tighter bound wins
+        None => per_slot_context_size, // the per-slot window itself becomes the cap
+    })
+}
+
+fn validate_parallel_context_startup(startup: &ServerStartupConfig) -> Result<()> {
+    if startup.ctx_size == 0 {
+        return Ok(()); // model-default context: no explicit budget, so nothing to validate
+    }
+
+    let slots = effective_parallel_context_slots(
+        startup.n_parallel,
+        startup.max_batch_size,
+        startup.no_batch,
+    ); // only used to report the divisor in the error message below
+    let per_slot_context_size = resolve_parallel_context_size(
+        startup.ctx_size,
+        startup.n_parallel,
+        startup.max_batch_size,
+        startup.no_batch,
+    ); // same resolver the server config uses, so the check matches the served window
+
+    anyhow::ensure!( // NOTE(review): message names --parallel; `mlxcel serve` appears to use --n-parallel — confirm
+        per_slot_context_size >= MIN_PARALLEL_CONTEXT_SIZE,
+        "--ctx-size {} divided across {} active slot(s) gives {} tokens per slot, below the minimum supported per-slot context size of {}; increase --ctx-size, reduce --parallel/--max-batch-size, or use --no-batch for single-slot serving",
+        startup.ctx_size,
+        slots,
+        per_slot_context_size,
+        MIN_PARALLEL_CONTEXT_SIZE
+    );
+
+    Ok(())
+}
+
 /// Resolve the elastic repartition configuration from CLI flags.
 ///
 /// Returns `None` when `--enable-elastic-pp` is not set, which is the
@@ -638,12 +722,20 @@ pub(super) fn build_server_config(
         &startup.tp_lm_head_mode,
     )
     .expect("tensor parallel config was already validated during startup");
+    let max_batch_size = startup.max_batch_size.unwrap_or(startup.n_parallel).max(1);
+    let context_size = resolve_parallel_context_size(
+        startup.ctx_size,
+        startup.n_parallel,
+        startup.max_batch_size,
+        startup.no_batch,
+    );
+    let max_kv_size = resolve_context_kv_cap(context_size, startup.max_kv_size);
 
     ServerConfig {
         api_key,
         timeout_seconds: startup.timeout,
         model_alias: startup.model_alias.clone(),
-        context_size: startup.ctx_size,
+        context_size,
         n_parallel: startup.n_parallel,
         enable_slots_endpoint: startup.enable_slots,
         enable_props_endpoint: startup.enable_props,
@@ -672,7 +764,7 @@ pub(super) fn build_server_config(
         // the resolved kind are known.
         draft_kind: startup.draft_kind.clone(),
         draft_block_size: startup.draft_block_size,
-        max_batch_size: startup.max_batch_size.unwrap_or(startup.n_parallel).max(1),
+        max_batch_size,
         max_queue_depth: startup.max_queue_depth,
         prefill_chunk_size: startup.prefill_chunk_size,
         enable_preemption: startup.enable_preemption,
@@ -704,10 +796,11 @@ pub(super) fn build_server_config(
         // the continuous-batching scheduler can apply per-layer modes
         // (with the last-layer skip) at sequence allocation time.
         batch_kv_quant: startup.batch_kv_quant,
-        // Issue #603: forward the resolved `--max-kv-size` so the scheduler
-        // can apply a head-trim policy to plain `KVCache` instances. `None`
-        // disables the cap and preserves the legacy unbounded behaviour.
-        max_kv_size: startup.max_kv_size,
+        // Issue #57/#603: forward the resolved per-slot context cap (optionally
+        // tightened by `--max-kv-size`) so the scheduler can apply a head-trim
+        // policy to plain `KVCache` instances. `None` means no explicit
+        // context or max-KV bound was configured.
+        max_kv_size,
     }
 }
 
@@ -1297,6 +1390,7 @@ pub async fn start_server(mut startup: ServerStartupConfig) -> Result<()> {
         anyhow::bail!("--dry-run was requested but --pp-auto was not provided; nothing to plan");
     }
 
+    validate_parallel_context_startup(&startup)?;
     validate_pipeline_parallel_startup(&startup)?;
     let tp_support = resolve_tensor_parallel_runtime_support(&startup)?;