From 19d90186551bfd9f3c8942f13b10578981f3f674 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:24:05 -0700 Subject: [PATCH 01/25] feat(hm-util): add platform-native COW clone + fuse-overlayfs backend --- Cargo.lock | 5 +- crates/hm-util/Cargo.toml | 4 + crates/hm-util/src/cow.rs | 327 ++++++++++++++++++++++++++++++++++++++ crates/hm-util/src/lib.rs | 1 + 4 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 crates/hm-util/src/cow.rs diff --git a/Cargo.lock b/Cargo.lock index a372b05..b7f1902 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1211,7 +1211,7 @@ dependencies = [ [[package]] name = "hm-plugin-cloud" -version = "0.1.0" +version = "0.0.0-dev" dependencies = [ "anyhow", "base64", @@ -1248,9 +1248,12 @@ dependencies = [ name = "hm-util" version = "0.0.0-dev" dependencies = [ + "anyhow", "dirs", "tempfile", "tokio", + "tracing", + "which 6.0.3", "windows", ] diff --git a/crates/hm-util/Cargo.toml b/crates/hm-util/Cargo.toml index 9e10e8a..4eea739 100644 --- a/crates/hm-util/Cargo.toml +++ b/crates/hm-util/Cargo.toml @@ -7,8 +7,12 @@ repository.workspace = true description = "Shared OS and filesystem utilities for Harmont crates." [dependencies] +anyhow = { workspace = true } dirs = "6" +tempfile = "3" tokio = { version = "1", features = ["rt", "rt-multi-thread", "fs", "io-util"] } +tracing = { workspace = true } +which = "6" [target.'cfg(windows)'.dependencies.windows] version = "0.62" diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs new file mode 100644 index 0000000..73ba301 --- /dev/null +++ b/crates/hm-util/src/cow.rs @@ -0,0 +1,327 @@ +//! Platform-native copy-on-write directory cloning. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use anyhow::{Context, Result, bail}; + +// ----------------------------------------------------------------------- +// Strategy detection +// ----------------------------------------------------------------------- + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CowStrategy { + ApfsClone, + Reflink, + FuseOverlay, + FullCopy, +} + +/// Detect the best available COW strategy for the current platform. +#[must_use] +#[allow(clippy::missing_const_for_fn)] // linux branch calls runtime functions +pub fn detect_strategy() -> CowStrategy { + #[cfg(target_os = "macos")] + { + return CowStrategy::ApfsClone; + } + + #[cfg(target_os = "linux")] + { + if probe_reflink() { + return CowStrategy::Reflink; + } + if probe_fuse_overlayfs() { + return CowStrategy::FuseOverlay; + } + return CowStrategy::FullCopy; + } + + #[allow(unreachable_code)] + CowStrategy::FullCopy +} + +#[cfg(target_os = "linux")] +fn probe_reflink() -> bool { + let tmp = match tempfile::tempdir() { + Ok(t) => t, + Err(_) => return false, + }; + let src = tmp.path().join("src"); + let dst = tmp.path().join("dst"); + if std::fs::write(&src, b"x").is_err() { + return false; + } + Command::new("cp") + .args(["--reflink=always"]) + .arg(&src) + .arg(&dst) + .stderr(std::process::Stdio::null()) + .status() + .is_ok_and(|s| s.success()) +} + +#[cfg(target_os = "linux")] +fn probe_fuse_overlayfs() -> bool { + which::which("fuse-overlayfs").is_ok() +} + +// ----------------------------------------------------------------------- +// cow_clone_dir +// ----------------------------------------------------------------------- + +/// Clone `src` to `dst` using the best available COW mechanism. +/// +/// # Errors +/// +/// Returns an error if `dst` already exists, if parent directories cannot +/// be created, or if the underlying copy operation fails. +pub fn cow_clone_dir(src: &Path, dst: &Path) -> Result<()> { + if dst.exists() { + bail!("destination already exists: {}", dst.display()); + } + if let Some(parent) = dst.parent() { + std::fs::create_dir_all(parent) + .with_context(|| format!("create parent dirs for {}", dst.display()))?; + } + + if try_platform_cow(src, dst)? { + return Ok(()); + } + + copy_dir_recursive(src, dst) +} + +fn try_platform_cow(src: &Path, dst: &Path) -> Result { + #[cfg(target_os = "macos")] + { + let status = Command::new("cp") + .args(["-c", "-R", "-p"]) + .arg(src) + .arg(dst) + .stderr(std::process::Stdio::null()) + .status() + .context("spawn cp -c")?; + if status.success() { + return Ok(true); + } + let _ = std::fs::remove_dir_all(dst); + } + + #[cfg(target_os = "linux")] + { + let status = Command::new("cp") + .args(["--reflink=always", "-a"]) + .arg(src) + .arg(dst) + .stderr(std::process::Stdio::null()) + .status() + .context("spawn cp --reflink")?; + if status.success() { + return Ok(true); + } + let _ = std::fs::remove_dir_all(dst); + + let status = Command::new("cp") + .args(["-a"]) + .arg(src) + .arg(dst) + .stderr(std::process::Stdio::null()) + .status() + .context("spawn cp -a")?; + if status.success() { + return Ok(true); + } + let _ = std::fs::remove_dir_all(dst); + } + + Ok(false) +} + +fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> { + std::fs::create_dir_all(dst) + .with_context(|| format!("create {}", dst.display()))?; + for entry in std::fs::read_dir(src) + .with_context(|| format!("read dir {}", src.display()))? + { + let entry = entry?; + let ty = entry.file_type()?; + let src_path = entry.path(); + let dst_path = dst.join(entry.file_name()); + if ty.is_dir() { + copy_dir_recursive(&src_path, &dst_path)?; + } else if ty.is_symlink() { + let target = std::fs::read_link(&src_path)?; + #[cfg(unix)] + std::os::unix::fs::symlink(&target, &dst_path)?; + #[cfg(windows)] + std::os::windows::fs::symlink_file(&target, &dst_path)?; + } else { + std::fs::copy(&src_path, &dst_path) + .with_context(|| format!("copy {}", src_path.display()))?; + } + } + Ok(()) +} + +// ----------------------------------------------------------------------- +// OverlayMount — fuse-overlayfs lifecycle (strategy 3) +// ----------------------------------------------------------------------- + +#[derive(Debug)] +pub struct OverlayMount { + merged: PathBuf, + upper: PathBuf, +} + +impl OverlayMount { + /// Mount a fuse-overlayfs filesystem merging the given layers. + /// + /// # Errors + /// + /// Returns an error if directory creation fails or `fuse-overlayfs` + /// exits with a non-zero status. + pub fn mount( + lower_dirs: &[&Path], + upper_dir: &Path, + work_dir: &Path, + merged_path: &Path, + ) -> Result { + std::fs::create_dir_all(upper_dir)?; + std::fs::create_dir_all(work_dir)?; + std::fs::create_dir_all(merged_path)?; + + let lowerdir: String = lower_dirs + .iter() + .map(|p| p.to_string_lossy().into_owned()) + .collect::>() + .join(":"); + + let opts = format!( + "lowerdir={lowerdir},upperdir={},workdir={}", + upper_dir.display(), + work_dir.display(), + ); + + let status = Command::new("fuse-overlayfs") + .args(["-o", &opts]) + .arg(merged_path) + .stderr(std::process::Stdio::piped()) + .status() + .context("spawn fuse-overlayfs")?; + + if !status.success() { + bail!( + "fuse-overlayfs mount failed (exit {}): lowerdir={}, upper={}, merged={}", + status.code().unwrap_or(-1), + lowerdir, + upper_dir.display(), + merged_path.display(), + ); + } + + Ok(Self { + merged: merged_path.to_path_buf(), + upper: upper_dir.to_path_buf(), + }) + } + + #[must_use] + pub fn merged_path(&self) -> &Path { + &self.merged + } + + #[must_use] + pub fn upper_dir(&self) -> &Path { + &self.upper + } + + /// Unmount the fuse-overlayfs filesystem. + /// + /// # Errors + /// + /// Returns an error if `fusermount` cannot be spawned or exits + /// with a non-zero status. + pub fn unmount(&self) -> Result<()> { + let bin = if which::which("fusermount3").is_ok() { + "fusermount3" + } else { + "fusermount" + }; + let status = Command::new(bin) + .args(["-u"]) + .arg(&self.merged) + .stderr(std::process::Stdio::null()) + .status() + .with_context(|| format!("spawn {bin} -u"))?; + if !status.success() { + bail!("{bin} -u {} failed", self.merged.display()); + } + Ok(()) + } +} + +impl Drop for OverlayMount { + fn drop(&mut self) { + if let Err(e) = self.unmount() { + tracing::warn!(%e, path = %self.merged.display(), "fuse-overlayfs unmount failed"); + } + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use std::fs; + + #[test] + fn cow_clone_creates_identical_tree() { + let tmp = tempfile::tempdir().unwrap(); + let src = tmp.path().join("src"); + fs::create_dir_all(src.join("sub")).unwrap(); + fs::write(src.join("a.txt"), b"hello").unwrap(); + fs::write(src.join("sub/b.txt"), b"world").unwrap(); + + let dst = tmp.path().join("dst"); + cow_clone_dir(&src, &dst).unwrap(); + + assert_eq!(fs::read_to_string(dst.join("a.txt")).unwrap(), "hello"); + assert_eq!(fs::read_to_string(dst.join("sub/b.txt")).unwrap(), "world"); + } + + #[test] + fn cow_clone_is_isolated() { + let tmp = tempfile::tempdir().unwrap(); + let src = tmp.path().join("src"); + fs::create_dir(&src).unwrap(); + fs::write(src.join("f.txt"), b"original").unwrap(); + + let dst = tmp.path().join("dst"); + cow_clone_dir(&src, &dst).unwrap(); + + // Mutate dst; src must be unchanged. + fs::write(dst.join("f.txt"), b"modified").unwrap(); + assert_eq!(fs::read_to_string(src.join("f.txt")).unwrap(), "original"); + assert_eq!(fs::read_to_string(dst.join("f.txt")).unwrap(), "modified"); + } + + #[test] + fn cow_clone_fails_if_dst_exists() { + let tmp = tempfile::tempdir().unwrap(); + let src = tmp.path().join("src"); + fs::create_dir(&src).unwrap(); + let dst = tmp.path().join("dst"); + fs::create_dir(&dst).unwrap(); + + assert!(cow_clone_dir(&src, &dst).is_err()); + } + + #[test] + fn detect_strategy_returns_something() { + // Should always detect at least FullCopy. + let s = detect_strategy(); + assert!(!matches!(s, CowStrategy::FuseOverlay)); + // Can't assert specific strategy (platform-dependent) but it must not panic. + } +} diff --git a/crates/hm-util/src/lib.rs b/crates/hm-util/src/lib.rs index c5284c5..e35ba64 100644 --- a/crates/hm-util/src/lib.rs +++ b/crates/hm-util/src/lib.rs @@ -1,2 +1,3 @@ +pub mod cow; pub mod dirs; pub mod os; From 8255473e2c63cb01d7f191170d385d3fbbc90aab Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:28:37 -0700 Subject: [PATCH 02/25] =?UTF-8?q?fix(cow):=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20cache=20strategy,=20fix=20deadlock,=20guard=20doubl?= =?UTF-8?q?e-unmount?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/hm-util/src/cow.rs | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index 73ba301..ee798ec 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -2,6 +2,7 @@ use std::path::{Path, PathBuf}; use std::process::Command; +use std::sync::OnceLock; use anyhow::{Context, Result, bail}; @@ -18,9 +19,14 @@ pub enum CowStrategy { } /// Detect the best available COW strategy for the current platform. +/// Result is cached after the first call. #[must_use] -#[allow(clippy::missing_const_for_fn)] // linux branch calls runtime functions pub fn detect_strategy() -> CowStrategy { + static STRATEGY: OnceLock = OnceLock::new(); + *STRATEGY.get_or_init(detect_strategy_inner) +} + +fn detect_strategy_inner() -> CowStrategy { #[cfg(target_os = "macos")] { return CowStrategy::ApfsClone; @@ -168,10 +174,10 @@ fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> { // OverlayMount — fuse-overlayfs lifecycle (strategy 3) // ----------------------------------------------------------------------- -#[derive(Debug)] pub struct OverlayMount { merged: PathBuf, upper: PathBuf, + mounted: std::sync::atomic::AtomicBool, } impl OverlayMount { @@ -203,17 +209,17 @@ impl OverlayMount { work_dir.display(), ); - let status = Command::new("fuse-overlayfs") + let output = Command::new("fuse-overlayfs") .args(["-o", &opts]) .arg(merged_path) - .stderr(std::process::Stdio::piped()) - .status() + .output() .context("spawn fuse-overlayfs")?; - if !status.success() { + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); bail!( - "fuse-overlayfs mount failed (exit {}): lowerdir={}, upper={}, merged={}", - status.code().unwrap_or(-1), + "fuse-overlayfs mount failed (exit {}): {stderr}\nlowerdir={}, upper={}, merged={}", + output.status.code().unwrap_or(-1), lowerdir, upper_dir.display(), merged_path.display(), @@ -223,6 +229,7 @@ impl OverlayMount { Ok(Self { merged: merged_path.to_path_buf(), upper: upper_dir.to_path_buf(), + mounted: std::sync::atomic::AtomicBool::new(true), }) } @@ -236,13 +243,16 @@ impl OverlayMount { &self.upper } - /// Unmount the fuse-overlayfs filesystem. + /// Unmount the fuse-overlayfs filesystem. Safe to call multiple times. /// /// # Errors /// /// Returns an error if `fusermount` cannot be spawned or exits /// with a non-zero status. pub fn unmount(&self) -> Result<()> { + if !self.mounted.swap(false, std::sync::atomic::Ordering::AcqRel) { + return Ok(()); + } let bin = if which::which("fusermount3").is_ok() { "fusermount3" } else { @@ -261,6 +271,15 @@ impl OverlayMount { } } +impl std::fmt::Debug for OverlayMount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OverlayMount") + .field("merged", &self.merged) + .field("upper", &self.upper) + .finish() + } +} + impl Drop for OverlayMount { fn drop(&mut self) { if let Err(e) = self.unmount() { From 9eb928c2433d3bd55a5c4f4d38816afc3ace5056 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:32:11 -0700 Subject: [PATCH 03/25] feat(docker): add start_long_lived_with_mounts for bind mount support Add build_host_config helper and start_long_lived_with_mounts method to DockerClient so containers can mount host workspace directories via HostConfig bind mounts. This enables the COW workspace caching feature to bind-mount overlay filesystems into build containers. --- crates/hm/src/orchestrator/docker_client.rs | 84 +++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/crates/hm/src/orchestrator/docker_client.rs b/crates/hm/src/orchestrator/docker_client.rs index 866ffb8..89210e2 100644 --- a/crates/hm/src/orchestrator/docker_client.rs +++ b/crates/hm/src/orchestrator/docker_client.rs @@ -14,6 +14,7 @@ use bollard::container::{ StopContainerOptions, }; use bollard::exec::{CreateExecOptions, StartExecResults}; +use bollard::models::HostConfig; use bollard::image::{ CommitContainerOptions, CreateImageOptions, ImportImageOptions, ListImagesOptions, RemoveImageOptions, @@ -23,6 +24,25 @@ use tokio::io::AsyncWrite; use crate::error::HmError; +/// Build a [`HostConfig`] with optional bind mounts and Linux capabilities. +/// +/// Empty slices become `None` so Docker applies its defaults. +fn build_host_config(binds: &[String], cap_add: &[String]) -> HostConfig { + HostConfig { + binds: if binds.is_empty() { + None + } else { + Some(binds.to_vec()) + }, + cap_add: if cap_add.is_empty() { + None + } else { + Some(cap_add.to_vec()) + }, + ..Default::default() + } +} + #[derive(Debug, Clone)] pub struct DockerClient { inner: Arc, @@ -161,6 +181,50 @@ impl DockerClient { Ok(create.id) } + /// Like [`Self::start_long_lived`] but with bind mounts via `HostConfig`. + /// + /// Each entry in `binds` is a Docker bind-mount string of the form + /// `"/host/path:/container/path"` (with an optional `:ro` suffix). + /// + /// # Errors + /// + /// Returns [`HmError::Docker`] if the container cannot be created + /// (image not pulled, name conflict, OCI runtime failure) or if + /// `start_container` rejects the create. + pub async fn start_long_lived_with_mounts( + &self, + image: &str, + env: &[String], + workdir: &str, + name: &str, + binds: &[String], + ) -> Result { + let cfg = Config { + image: Some(image.to_string()), + cmd: Some(vec!["sh".into(), "-c".into(), "sleep infinity".into()]), + env: Some(env.to_vec()), + working_dir: Some(workdir.to_string()), + host_config: Some(build_host_config(binds, &[])), + ..Default::default() + }; + let create = self + .inner + .create_container( + Some(CreateContainerOptions { + name, + ..Default::default() + }), + cfg, + ) + .await + .map_err(|e| HmError::Docker(format!("create_container: {e}")))?; + self.inner + .start_container(&create.id, None::>) + .await + .map_err(|e| HmError::Docker(format!("start_container: {e}")))?; + Ok(create.id) + } + /// Exec a command inside a running container and stream stdout+stderr /// to `out`. Returns the command's exit code. /// @@ -530,4 +594,24 @@ mod smoke { .unwrap(); assert!(tags.is_empty()); } + + #[test] + fn build_host_config_with_binds_and_no_caps() { + let hc = super::build_host_config( + &["/host/path:/container/path".to_string()], + &[], + ); + assert_eq!( + hc.binds.as_ref().unwrap(), + &["/host/path:/container/path".to_string()] + ); + assert!(hc.cap_add.is_none()); + } + + #[test] + fn build_host_config_empty_binds_is_none() { + let hc = super::build_host_config(&[], &[]); + assert!(hc.binds.is_none()); + assert!(hc.cap_add.is_none()); + } } From c88fd4196cff751c671e706147c369f064d49f00 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:32:42 -0700 Subject: [PATCH 04/25] feat(orchestrator): add WorkspaceManager with COW cloning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces WorkspaceManager that manages per-step workspace directories for a single pipeline run. Auto-selects between clone strategy (macOS APFS / Linux reflink / fallback cp) and overlay strategy (fuse-overlayfs) based on platform detection from hm-util::cow. The rest of the system only sees workspace_path() — strategy is transparent. --- crates/hm/src/orchestrator/mod.rs | 1 + crates/hm/src/orchestrator/workspace.rs | 358 ++++++++++++++++++++++++ 2 files changed, 359 insertions(+) create mode 100644 crates/hm/src/orchestrator/workspace.rs diff --git a/crates/hm/src/orchestrator/mod.rs b/crates/hm/src/orchestrator/mod.rs index 7c23600..f6fffcd 100644 --- a/crates/hm/src/orchestrator/mod.rs +++ b/crates/hm/src/orchestrator/mod.rs @@ -14,5 +14,6 @@ pub mod output_subscriber; pub mod scheduler; pub mod signal; pub mod source; +pub mod workspace; pub use scheduler::run; diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs new file mode 100644 index 0000000..ada1439 --- /dev/null +++ b/crates/hm/src/orchestrator/workspace.rs @@ -0,0 +1,358 @@ +//! Per-run workspace orchestration. +//! +//! [`WorkspaceManager`] auto-selects between two strategies: +//! +//! - **Clone strategy** (macOS APFS, Linux reflink, fallback `cp`): +//! each step gets a full directory clone via [`hm_util::cow::cow_clone_dir`]. +//! - **Overlay strategy** (Linux ext4 + `fuse-overlayfs`): +//! each step gets a `fuse-overlayfs` mount with shared lower layers. +//! +//! The rest of the system (scheduler, runner) only sees +//! [`WorkspaceManager::workspace_path`] — strategy is transparent. + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use hm_util::cow::{CowStrategy, OverlayMount}; + +/// Manages workspace directories for a single pipeline run. +/// +/// Each step gets an isolated directory that is either a full COW clone +/// or a `fuse-overlayfs` mount, depending on the platform. +pub struct WorkspaceManager { + run_dir: PathBuf, + base_dir: PathBuf, + strategy: CowStrategy, + workspaces: HashMap, + overlays: HashMap, +} + +struct OverlayLayer { + upper_dir: PathBuf, + merged_dir: PathBuf, + ancestor_uppers: Vec, + _mount: Option, +} + +impl WorkspaceManager { + /// Create a new workspace manager that clones from `base_dir` into + /// per-step sub-directories under `run_dir`. + /// + /// # Errors + /// + /// Returns an error if `run_dir` cannot be created. + pub fn from_base(run_dir: PathBuf, base_dir: PathBuf) -> Result { + std::fs::create_dir_all(&run_dir) + .with_context(|| format!("create run dir {}", run_dir.display()))?; + let strategy = hm_util::cow::detect_strategy(); + tracing::info!(?strategy, "COW workspace strategy"); + Ok(Self { + run_dir, + base_dir, + strategy, + workspaces: HashMap::new(), + overlays: HashMap::new(), + }) + } + + /// Create a new workspace manager that first extracts a tar.gz + /// archive into `run_dir/base`, then delegates to [`Self::from_base`]. + /// + /// # Errors + /// + /// Returns an error if the archive cannot be extracted or the run + /// directory cannot be created. + pub fn from_archive(run_dir: PathBuf, archive_bytes: &[u8]) -> Result { + let base_dir = run_dir.join("base"); + std::fs::create_dir_all(&base_dir) + .with_context(|| format!("create base dir {}", base_dir.display()))?; + extract_tar_gz(archive_bytes, &base_dir)?; + Self::from_base(run_dir, base_dir) + } + + /// Create an isolated workspace directory for `step_key`. + /// + /// If `parent_key` is `Some`, the workspace inherits the contents of + /// the parent workspace (including any modifications made after + /// creation). If `None`, the workspace is cloned from the base + /// directory. + /// + /// # Errors + /// + /// Returns an error if the parent workspace is not registered, or if + /// the clone / overlay operation fails. + pub fn create_workspace( + &mut self, + step_key: &str, + parent_key: Option<&str>, + ) -> Result { + match self.strategy { + CowStrategy::FuseOverlay => self.create_overlay(step_key, parent_key), + _ => self.create_clone(step_key, parent_key, None), + } + } + + /// Create a workspace from a cached directory, bypassing parent + /// relationships. + /// + /// # Errors + /// + /// Returns an error if the clone operation fails. + pub fn create_workspace_from_cache( + &mut self, + step_key: &str, + cached_workspace: &Path, + ) -> Result { + self.create_clone(step_key, None, Some(cached_workspace)) + } + + /// Look up the filesystem path for a previously created workspace. + #[must_use] + pub fn workspace_path(&self, step_key: &str) -> Option<&Path> { + if let Some(p) = self.workspaces.get(step_key) { + return Some(p.as_path()); + } + self.overlays + .get(step_key) + .map(|l| l.merged_dir.as_path()) + } + + /// The base directory that root workspaces are cloned from. + #[must_use] + pub fn base_dir(&self) -> &Path { + &self.base_dir + } + + /// The COW strategy in use for this run. + #[must_use] + pub const fn strategy(&self) -> CowStrategy { + self.strategy + } + + /// Remove the entire run directory, including all workspaces and + /// overlay mounts. + /// + /// # Errors + /// + /// Returns an error if the run directory cannot be removed. + pub fn cleanup(&mut self) -> Result<()> { + // Drop overlay mounts before removing the filesystem tree. + self.overlays.clear(); + if self.run_dir.exists() { + std::fs::remove_dir_all(&self.run_dir).with_context(|| { + format!("cleanup run dir {}", self.run_dir.display()) + })?; + } + Ok(()) + } + + // ------------------------------------------------------------------ + // Clone strategy + // ------------------------------------------------------------------ + + fn create_clone( + &mut self, + step_key: &str, + parent_key: Option<&str>, + cached: Option<&Path>, + ) -> Result { + let safe = sanitize_key(step_key); + let ws_dir = self.run_dir.join("workspaces").join(&safe); + + let source = if let Some(c) = cached { + c.to_path_buf() + } else if let Some(pk) = parent_key { + self.workspaces + .get(pk) + .cloned() + .ok_or_else(|| anyhow::anyhow!("parent workspace '{pk}' not registered"))? + } else { + self.base_dir.clone() + }; + + hm_util::cow::cow_clone_dir(&source, &ws_dir).with_context(|| { + format!("cow clone {} -> {}", source.display(), ws_dir.display()) + })?; + + self.workspaces.insert(step_key.to_string(), ws_dir.clone()); + Ok(ws_dir) + } + + // ------------------------------------------------------------------ + // Overlay strategy + // ------------------------------------------------------------------ + + fn create_overlay( + &mut self, + step_key: &str, + parent_key: Option<&str>, + ) -> Result { + let safe = sanitize_key(step_key); + let layer_dir = self.run_dir.join("layers").join(&safe); + let upper_dir = layer_dir.join("upper"); + let work_dir = layer_dir.join("work"); + let merged_dir = layer_dir.join("merged"); + + std::fs::create_dir_all(&upper_dir)?; + std::fs::create_dir_all(&work_dir)?; + std::fs::create_dir_all(&merged_dir)?; + + let ancestor_uppers = if let Some(pk) = parent_key { + let parent = self.overlays.get(pk).ok_or_else(|| { + anyhow::anyhow!("parent overlay '{pk}' not registered") + })?; + let mut ancestors = vec![parent.upper_dir.clone()]; + ancestors.extend(parent.ancestor_uppers.iter().cloned()); + ancestors + } else { + vec![] + }; + + let mut lower_dirs: Vec<&Path> = + ancestor_uppers.iter().map(PathBuf::as_path).collect(); + lower_dirs.push(&self.base_dir); + + let mount = OverlayMount::mount( + &lower_dirs, + &upper_dir, + &work_dir, + &merged_dir, + )?; + + let merged_path = merged_dir.clone(); + self.overlays.insert( + step_key.to_string(), + OverlayLayer { + upper_dir, + merged_dir, + ancestor_uppers, + _mount: Some(mount), + }, + ); + Ok(merged_path) + } +} + +impl std::fmt::Debug for WorkspaceManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("WorkspaceManager") + .field("run_dir", &self.run_dir) + .field("workspaces", &self.workspaces.keys().collect::>()) + .finish_non_exhaustive() + } +} + +fn sanitize_key(s: &str) -> String { + s.chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '_' || c == '-' { + c + } else { + '-' + } + }) + .collect() +} + +fn extract_tar_gz(bytes: &[u8], dest: &Path) -> Result<()> { + use flate2::read::GzDecoder; + + let decoder = GzDecoder::new(bytes); + let mut archive = tar::Archive::new(decoder); + archive + .unpack(dest) + .with_context(|| format!("extract archive to {}", dest.display())) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use std::fs; + + fn make_base(tmp: &std::path::Path) -> PathBuf { + let base = tmp.join("base"); + fs::create_dir(&base).unwrap(); + fs::write(base.join("main.rs"), b"fn main() {}").unwrap(); + base + } + + #[test] + fn root_step_clones_base() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + + let ws = mgr.create_workspace("build", None).unwrap(); + assert_eq!( + fs::read_to_string(ws.join("main.rs")).unwrap(), + "fn main() {}" + ); + } + + #[test] + fn child_step_inherits_parent_changes() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + + let ws_a = mgr.create_workspace("a", None).unwrap(); + fs::write(ws_a.join("artifact.bin"), b"built").unwrap(); + + let ws_b = mgr.create_workspace("b", Some("a")).unwrap(); + assert_eq!( + fs::read_to_string(ws_b.join("main.rs")).unwrap(), + "fn main() {}" + ); + assert_eq!( + fs::read_to_string(ws_b.join("artifact.bin")).unwrap(), + "built" + ); + } + + #[test] + fn fork_children_are_isolated() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + + let ws_a = mgr.create_workspace("a", None).unwrap(); + fs::write(ws_a.join("from_a"), b"a").unwrap(); + + let ws_b = mgr.create_workspace("b", Some("a")).unwrap(); + let ws_c = mgr.create_workspace("c", Some("a")).unwrap(); + + fs::write(ws_b.join("from_b"), b"b").unwrap(); + assert!(!ws_c.join("from_b").exists(), "c must not see b's changes"); + } + + #[test] + fn workspace_path_returns_created() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + + mgr.create_workspace("s", None).unwrap(); + assert!(mgr.workspace_path("s").is_some()); + assert!(mgr.workspace_path("nonexistent").is_none()); + } + + #[test] + fn cleanup_removes_run_dir() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let run_dir = tmp.path().join("run"); + let mut mgr = + WorkspaceManager::from_base(run_dir.clone(), base).unwrap(); + mgr.create_workspace("s", None).unwrap(); + assert!(run_dir.exists()); + + mgr.cleanup().unwrap(); + assert!(!run_dir.exists()); + } +} From ec61830afe885ed524ca31b922b8889dbf1133ca Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:37:49 -0700 Subject: [PATCH 05/25] fix: code review fixes for Tasks 2 and 3 - Add duplicate step_key guard in WorkspaceManager - Add tests for create_workspace_from_cache and duplicate key - Deduplicate start_long_lived via delegation to start_long_lived_with_mounts --- crates/hm/src/orchestrator/docker_client.rs | 23 +-------------- crates/hm/src/orchestrator/workspace.rs | 31 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/crates/hm/src/orchestrator/docker_client.rs b/crates/hm/src/orchestrator/docker_client.rs index 89210e2..6fefc28 100644 --- a/crates/hm/src/orchestrator/docker_client.rs +++ b/crates/hm/src/orchestrator/docker_client.rs @@ -156,29 +156,8 @@ impl DockerClient { workdir: &str, name: &str, ) -> Result { - let cfg = Config { - image: Some(image.to_string()), - cmd: Some(vec!["sh".into(), "-c".into(), "sleep infinity".into()]), - env: Some(env.to_vec()), - working_dir: Some(workdir.to_string()), - ..Default::default() - }; - let create = self - .inner - .create_container( - Some(CreateContainerOptions { - name, - ..Default::default() - }), - cfg, - ) + self.start_long_lived_with_mounts(image, env, workdir, name, &[]) .await - .map_err(|e| HmError::Docker(format!("create_container: {e}")))?; - self.inner - .start_container(&create.id, None::>) - .await - .map_err(|e| HmError::Docker(format!("start_container: {e}")))?; - Ok(create.id) } /// Like [`Self::start_long_lived`] but with bind mounts via `HostConfig`. diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs index ada1439..f49e7d1 100644 --- a/crates/hm/src/orchestrator/workspace.rs +++ b/crates/hm/src/orchestrator/workspace.rs @@ -87,6 +87,9 @@ impl WorkspaceManager { step_key: &str, parent_key: Option<&str>, ) -> Result { + if self.workspaces.contains_key(step_key) || self.overlays.contains_key(step_key) { + anyhow::bail!("workspace for step '{step_key}' already exists"); + } match self.strategy { CowStrategy::FuseOverlay => self.create_overlay(step_key, parent_key), _ => self.create_clone(step_key, parent_key, None), @@ -342,6 +345,34 @@ mod tests { assert!(mgr.workspace_path("nonexistent").is_none()); } + #[test] + fn create_workspace_from_cache_clones_cached_dir() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let cached = tmp.path().join("cached"); + fs::create_dir(&cached).unwrap(); + fs::write(cached.join("cached_file.txt"), b"from_cache").unwrap(); + + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let ws = mgr.create_workspace_from_cache("s", &cached).unwrap(); + assert_eq!( + fs::read_to_string(ws.join("cached_file.txt")).unwrap(), + "from_cache" + ); + assert!(!ws.join("main.rs").exists()); + } + + #[test] + fn duplicate_step_key_errors() { + let tmp = tempfile::tempdir().unwrap(); + let base = make_base(tmp.path()); + let mut mgr = + WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + mgr.create_workspace("dup", None).unwrap(); + assert!(mgr.create_workspace("dup", None).is_err()); + } + #[test] fn cleanup_removes_run_dir() { let tmp = tempfile::tempdir().unwrap(); From 8daf6312f045d2b3bc5c98a447fd5d98e8e2e8c2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:40:25 -0700 Subject: [PATCH 06/25] feat(runner): add COW workspace execution path with bind mounts When RunContext.workspace is Some, DockerRunner dispatches to a new run_step_cow path that bind-mounts the host workspace directory into the container instead of extracting a tar archive and committing a docker snapshot. This eliminates the tar extraction and docker commit overhead for the COW execution mode. --- crates/hm/src/orchestrator/scheduler.rs | 1 + crates/hm/src/runner/docker.rs | 144 +++++++++++++++++++++++- crates/hm/src/runner/mod.rs | 6 +- 3 files changed, 149 insertions(+), 2 deletions(-) diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index 45e7209..b2985fa 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -96,6 +96,7 @@ pub async fn run( event_bus: bus.clone(), archives: archives.clone(), cancel: cancel.clone(), + workspace: None, }; let parallelism = parallelism.max(1); diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index 0b814a1..fe862e0 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -70,7 +70,13 @@ impl StepRunner for DockerRunner { input: ExecutorInput, ) -> Pin> + Send + '_>> { let ctx = ctx.clone(); - Box::pin(async move { run_step(&ctx, input).await }) + Box::pin(async move { + if ctx.workspace.is_some() { + run_step_cow(&ctx, input).await + } else { + run_step(&ctx, input).await + } + }) } } @@ -237,6 +243,128 @@ async fn run_in_container( }) } +// --------------------------------------------------------------------------- +// COW execution path +// --------------------------------------------------------------------------- + +/// Pick the base image for a COW step. +/// +/// In COW mode the workspace is bind-mounted, so there is no +/// parent-snapshot chain. We use the step's declared image or fall +/// back to `alpine:latest`. +fn resolve_image_cow(step: &CommandStep) -> String { + step.image + .clone() + .unwrap_or_else(|| "alpine:latest".to_string()) +} + +async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result { + let plan = decision_plan(&input.cache_lookup); + + if !plan.run_command { + return Ok(StepResult { + exit_code: 0, + committed_snapshot: plan.hit_tag.clone(), + artifacts: vec![], + }); + } + + let workspace_mgr = ctx + .workspace + .as_ref() + .ok_or_else(|| anyhow::anyhow!("COW mode requires workspace manager"))?; + + let workspace_path = { + let mgr = workspace_mgr.lock().unwrap_or_else(|e| e.into_inner()); + mgr.workspace_path(&input.step.key) + .map(|p| p.to_path_buf()) + .ok_or_else(|| { + anyhow::anyhow!("workspace for step '{}' not created", input.step.key) + })? + }; + + let image = resolve_image_cow(&input.step); + let container_name = + sanitize_container_name(&input.run_id.to_string(), &input.step.key); + let env_vec: Vec = input + .env + .iter() + .map(|(k, v)| format!("{k}={v}")) + .collect(); + + // Pull image if needed. + if !ctx.docker.image_exists(&image).await.unwrap_or(false) { + let docker = ctx.docker.clone(); + let cancel = ctx.cancel.clone(); + let img = image.clone(); + let pull_fut = async move { docker.pull_image(&img).await }; + tokio::select! { + result = pull_fut => result.with_context(|| format!("pull '{image}'"))?, + () = cancel.cancelled() => anyhow::bail!("cancelled during image pull"), + } + } + + // Start container with workspace bind mount. + let binds = vec![format!("{}:/workspace", workspace_path.display())]; + let cid = ctx + .docker + .start_long_lived_with_mounts( + &image, + &env_vec, + &input.workdir, + &container_name, + &binds, + ) + .await + .context("docker start with mounts failed")?; + + let result = run_cow_in_container(ctx, &cid, &input, &env_vec).await; + ctx.docker.stop_remove(&cid).await; + result +} + +async fn run_cow_in_container( + ctx: &RunContext, + cid: &str, + input: &ExecutorInput, + env_vec: &[String], +) -> Result { + let mut writer = StepLogWriter::new(input.step_id, Arc::clone(&ctx.event_bus)); + let docker = ctx.docker.clone(); + let cancel = ctx.cancel.clone(); + let cid_owned = cid.to_owned(); + let cmd = vec!["sh".into(), "-c".into(), input.step.cmd.clone()]; + let workdir = input.workdir.clone(); + let env_owned = env_vec.to_vec(); + let exec_fut = async move { + let rc = docker + .exec_streaming(&cid_owned, &cmd, &env_owned, &workdir, &mut writer) + .await?; + writer.flush_remaining(); + Ok::(rc) + }; + + let rc = tokio::select! { + result = exec_fut => result.context("docker exec failed")?, + () = cancel.cancelled() => { + return Ok(StepResult { + exit_code: 130, + committed_snapshot: None, + artifacts: vec![], + }); + } + }; + + #[allow(clippy::cast_possible_truncation)] + let exit_code = rc as i32; + + Ok(StepResult { + exit_code, + committed_snapshot: None, + artifacts: vec![], + }) +} + // --------------------------------------------------------------------------- // DecisionPlan // --------------------------------------------------------------------------- @@ -413,6 +541,20 @@ mod tests { } } + // -- resolve_image_cow ---------------------------------------------------- + + #[test] + fn resolve_image_cow_uses_step_image() { + let s = step_with_image(Some("rust:1.82")); + assert_eq!(resolve_image_cow(&s), "rust:1.82"); + } + + #[test] + fn resolve_image_cow_fallback_alpine() { + let s = step_with_image(None); + assert_eq!(resolve_image_cow(&s), "alpine:latest"); + } + // -- resolve_image ------------------------------------------------------- #[test] diff --git a/crates/hm/src/runner/mod.rs b/crates/hm/src/runner/mod.rs index aefeec8..9e32b89 100644 --- a/crates/hm/src/runner/mod.rs +++ b/crates/hm/src/runner/mod.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; use std::pin::Pin; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use anyhow::Result; use hm_plugin_protocol::{BuildEvent, ExecutorInput, StepResult}; @@ -18,6 +18,7 @@ use tokio_util::sync::CancellationToken; use crate::orchestrator::archive::ArchiveStore; use crate::orchestrator::docker_client::DockerClient; use crate::orchestrator::events::EventBus; +use crate::orchestrator::workspace::WorkspaceManager; pub mod docker; @@ -36,6 +37,9 @@ pub struct RunContext { pub event_bus: Arc, pub archives: Arc, pub cancel: CancellationToken, + /// When present, steps use COW workspace bind mounts instead of + /// tar.gz extraction + docker commit. + pub workspace: Option>>, } // --------------------------------------------------------------------------- From 962db63efb13803db206bc00c7ac0ad39374c8e5 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:43:24 -0700 Subject: [PATCH 07/25] fix(runner): use workdir for bind mount target, error on mutex poison --- crates/hm/src/runner/docker.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index fe862e0..7187def 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -275,7 +275,9 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result Result Date: Tue, 26 May 2026 12:46:12 -0700 Subject: [PATCH 08/25] feat: wire COW workspace mode through scheduler and CLI --- crates/hm/src/cli/run.rs | 4 +++ crates/hm/src/commands/run/local.rs | 3 ++- crates/hm/src/orchestrator/scheduler.rs | 36 +++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/crates/hm/src/cli/run.rs b/crates/hm/src/cli/run.rs index 2ac4697..f145c68 100644 --- a/crates/hm/src/cli/run.rs +++ b/crates/hm/src/cli/run.rs @@ -42,4 +42,8 @@ pub struct RunArgs { /// Has no effect with `--format json`. #[arg(long)] pub logs: bool, + + /// Use COW workspace bind mounts instead of Docker image commits. + #[arg(long)] + pub cow: bool, } diff --git a/crates/hm/src/commands/run/local.rs b/crates/hm/src/commands/run/local.rs index 62523e4..93bb46d 100644 --- a/crates/hm/src/commands/run/local.rs +++ b/crates/hm/src/commands/run/local.rs @@ -84,6 +84,7 @@ pub async fn handle(args: RunArgs, ctx: RunContext) -> Result { }; let exit_code = - crate::orchestrator::run(graph, repo_root, parallelism, runner_registry, renderer).await?; + crate::orchestrator::run(graph, repo_root, parallelism, runner_registry, renderer, args.cow) + .await?; Ok(exit_code) } diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index b2985fa..f77781c 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -72,6 +72,7 @@ pub async fn run( parallelism: usize, runner_registry: Arc, renderer: Box, + cow: bool, ) -> Result { // Set up per-run state. let bus = EventBus::new(); @@ -89,6 +90,20 @@ pub async fn run( // Build the source archive once. let archive_bytes = build_archive_bytes(&repo_root).context("build source archive")?; + + // When COW mode is enabled, extract the source archive into a + // temporary directory and create a workspace manager that will + // produce per-step COW clones. This must happen before + // `archives.register()` which consumes the bytes. + let workspace = if cow { + let run_dir = std::env::temp_dir().join(format!("harmont-run-{run_id}")); + let mgr = super::workspace::WorkspaceManager::from_archive(run_dir, &archive_bytes) + .context("init COW workspace")?; + Some(Arc::new(std::sync::Mutex::new(mgr))) + } else { + None + }; + let archive_id = archives.register(archive_bytes); let run_ctx = RunContext { @@ -96,7 +111,7 @@ pub async fn run( event_bus: bus.clone(), archives: archives.clone(), cancel: cancel.clone(), - workspace: None, + workspace: workspace.clone(), }; let parallelism = parallelism.max(1); @@ -233,6 +248,14 @@ pub async fn run( } } + // Clean up the COW workspace tree if one was created. + if let Some(ref ws) = workspace { + let mut mgr = ws.lock().map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + if let Err(e) = mgr.cleanup() { + tracing::warn!(%e, "failed to clean up COW workspace"); + } + } + bus.emit(BuildEvent::BuildEnd { exit_code: overall, duration_ms: dur, @@ -283,7 +306,7 @@ async fn execute_step( step_id, key: step_key.clone(), chain_idx: chain_pos, - parent_key, + parent_key: parent_key.clone(), display_name: display_name.clone(), }); @@ -310,6 +333,15 @@ async fn execute_step( }); } + // Create a COW workspace for this step when running in COW mode. + if let Some(ref workspace) = run_ctx.workspace { + let mut mgr = workspace + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + mgr.create_workspace(&step_key, parent_key.as_deref()) + .context("create workspace for step")?; + } + let input = ExecutorInput { step: step_wire, workspace_archive_id: archive_id, From 1791cc51d5b01adcd83ea6099e40b2c9ee9ecfca Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:50:17 -0700 Subject: [PATCH 09/25] fix(scheduler): create COW workspace before cache-hit short-circuit --- crates/hm/src/orchestrator/scheduler.rs | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index f77781c..6508ec1 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -314,6 +314,17 @@ async fn execute_step( let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; let decision = outcome.decision; + // Create a COW workspace for this step when running in COW mode. + // This must happen before the cache-hit short-circuit so that + // downstream steps can clone from this step's workspace. + if let Some(ref workspace) = run_ctx.workspace { + let mut mgr = workspace + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + mgr.create_workspace(&step_key, parent_key.as_deref()) + .context("create workspace for step")?; + } + if let hm_plugin_protocol::CacheDecision::Hit { tag } = &decision { bus.emit(BuildEvent::StepCacheHit { step_id, @@ -324,24 +335,12 @@ async fn execute_step( .unwrap_or_default(), tag: tag.0.clone(), }); - // Short-circuit: the cached image already exists locally, so - // there is nothing for the executor to do. Return the - // snapshot so downstream nodes can use it as their parent. return Ok(StepOutcome { exit_code: 0, snapshot: Some(tag.clone()), }); } - // Create a COW workspace for this step when running in COW mode. - if let Some(ref workspace) = run_ctx.workspace { - let mut mgr = workspace - .lock() - .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; - mgr.create_workspace(&step_key, parent_key.as_deref()) - .context("create workspace for step")?; - } - let input = ExecutorInput { step: step_wire, workspace_archive_id: archive_id, From e9b7ce6cf7eab62d401239cd6d6d6f3390de9952 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:51:43 -0700 Subject: [PATCH 10/25] fix: skip Docker ephemeral cleanup in COW workspace mode --- crates/hm/src/orchestrator/scheduler.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index 6508ec1..e8449a2 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -235,16 +235,18 @@ pub async fn run( let dur = started_total.elapsed().as_millis() as u64; - // Clean up ephemeral images created during this run. - let ephemeral_tags: Vec<&str> = outcomes - .iter() - .filter_map(|o| o.snapshot.as_ref()) - .filter(|s| s.0.starts_with("harmont-local-ephemeral/")) - .map(|s| s.0.as_str()) - .collect(); - for tag in ephemeral_tags { - if let Err(e) = docker.remove_image(tag).await { - tracing::warn!(image = %tag, %e, "failed to remove ephemeral image"); + // Clean up ephemeral images (legacy mode only — COW mode has no Docker commits). + if !cow { + let ephemeral_tags: Vec<&str> = outcomes + .iter() + .filter_map(|o| o.snapshot.as_ref()) + .filter(|s| s.0.starts_with("harmont-local-ephemeral/")) + .map(|s| s.0.as_str()) + .collect(); + for tag in ephemeral_tags { + if let Err(e) = docker.remove_image(tag).await { + tracing::warn!(image = %tag, %e, "failed to remove ephemeral image"); + } } } From 08623c88856c3576cf8693eb2a1f220367d37c2a Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:54:19 -0700 Subject: [PATCH 11/25] feat(cache): add COW workspace cache backend Persist completed step workspaces to ~/.harmont/cache/workspaces/ and restore them on cache hits, replacing Docker image-based caching when running in COW mode. Stale cache directories from previous keys are evicted after a successful build. --- crates/hm/src/orchestrator/cache.rs | 176 +++++++++++++++++++++++- crates/hm/src/orchestrator/scheduler.rs | 69 ++++++++-- 2 files changed, 232 insertions(+), 13 deletions(-) diff --git a/crates/hm/src/orchestrator/cache.rs b/crates/hm/src/orchestrator/cache.rs index e0bdc5a..78fe3c4 100644 --- a/crates/hm/src/orchestrator/cache.rs +++ b/crates/hm/src/orchestrator/cache.rs @@ -8,7 +8,9 @@ //! along the JSON in `cache.key`. We turn them into Docker image tags //! and consult the local image store. -use anyhow::Result; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; use hm_plugin_protocol::{CacheDecision, CommandStep, SnapshotRef}; use crate::orchestrator::docker_client::DockerClient; @@ -98,6 +100,116 @@ pub async fn decide(docker: &DockerClient, step: &CommandStep) -> Result, + pub stale_dirs: Vec, +} + +/// Resolve the on-disk cache directory for a step's COW workspace. +/// +/// Returns `None` when the step has no cache, a `"none"` policy, or no +/// cache key — matching the same guard logic as [`cache_image_tag`]. +pub fn cow_cache_dir(step: &CommandStep) -> Result> { + let cache = match step.cache.as_ref() { + Some(c) if c.policy != "none" => c, + _ => return Ok(None), + }; + let key = match cache.key.as_deref() { + Some(k) => k, + None => return Ok(None), + }; + let base = hm_util::dirs::harmont_config_dir() + .ok_or_else(|| anyhow::anyhow!("cannot resolve ~/.harmont"))?; + let safe = sanitize_for_tag(&step.key); + let short = &key[..key.len().min(16)]; + Ok(Some( + base.join("cache") + .join("workspaces") + .join(safe) + .join(short), + )) +} + +/// Decide cache outcome for a step against the local COW workspace +/// cache directory. +/// +/// # Errors +/// Returns an error if the config directory cannot be resolved or the +/// stale directory listing fails. +pub fn decide_cow(step: &CommandStep) -> Result { + let Some(cache_dir) = cow_cache_dir(step)? else { + return Ok(CowCacheOutcome { + decision: CacheDecision::MissNoCommit, + cache_to: None, + stale_dirs: vec![], + }); + }; + if cache_dir.exists() { + Ok(CowCacheOutcome { + decision: CacheDecision::Hit { + tag: SnapshotRef::from(format!("cow:{}", cache_dir.display())), + }, + cache_to: None, + stale_dirs: vec![], + }) + } else { + let step_cache_root = cache_dir.parent().expect("cache_dir always has parent"); + let stale = if step_cache_root.exists() { + std::fs::read_dir(step_cache_root)? + .filter_map(|e| e.ok()) + .map(|e| e.path()) + .filter(|p| *p != cache_dir) + .collect() + } else { + vec![] + }; + Ok(CowCacheOutcome { + decision: CacheDecision::MissBuildAs { + tag: SnapshotRef::from(format!("cow:{}", cache_dir.display())), + }, + cache_to: Some(cache_dir), + stale_dirs: stale, + }) + } +} + +/// Persist a completed workspace directory into the COW cache. +/// +/// Creates intermediate directories and performs a COW clone. If the +/// cache directory already exists (e.g. a concurrent run beat us) the +/// function returns `Ok(())` without overwriting. +/// +/// # Errors +/// Returns an error if the parent directory cannot be created or the +/// COW clone fails. +pub fn persist_cow_cache(workspace_path: &Path, cache_dir: &Path) -> Result<()> { + if let Some(parent) = cache_dir.parent() { + std::fs::create_dir_all(parent)?; + } + if cache_dir.exists() { + return Ok(()); + } + hm_util::cow::cow_clone_dir(workspace_path, cache_dir) + .context("persist workspace to COW cache") +} + +/// Remove stale COW cache directories left over from previous cache +/// keys. Failures are logged but never propagated. +pub fn evict_stale_cow_dirs(dirs: &[PathBuf]) { + for dir in dirs { + if let Err(e) = std::fs::remove_dir_all(dir) { + tracing::warn!(path = %dir.display(), %e, "failed to evict stale COW cache"); + } + } +} + #[cfg(test)] #[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] mod tests { @@ -148,4 +260,66 @@ mod tests { assert_eq!(sanitize_for_tag("simple"), "simple"); assert_eq!(sanitize_for_tag("a_b-c"), "a_b-c"); } + + // ------------------------------------------------------------------ + // COW workspace cache tests + // ------------------------------------------------------------------ + + #[test] + fn cow_cache_dir_returns_path_for_cacheable_step() { + let s = step(Some(Cache { + policy: "ttl".into(), + key: Some("0123456789abcdef0000".into()), + })); + let dir = cow_cache_dir(&s).unwrap(); + assert!(dir.is_some(), "expected Some for cacheable step"); + let dir = dir.unwrap(); + assert!( + dir.ends_with("cache/workspaces/build/0123456789abcdef"), + "unexpected path: {}", + dir.display() + ); + } + + #[test] + fn cow_cache_dir_returns_none_for_no_cache() { + let s = step(None); + let dir = cow_cache_dir(&s).unwrap(); + assert!(dir.is_none()); + } + + #[test] + fn cow_cache_dir_returns_none_for_policy_none() { + let s = step(Some(Cache { + policy: "none".into(), + key: Some("abcdef1234567890".into()), + })); + let dir = cow_cache_dir(&s).unwrap(); + assert!(dir.is_none()); + } + + #[test] + fn decide_cow_miss_no_commit_when_no_cache() { + let s = step(None); + let outcome = decide_cow(&s).unwrap(); + assert!(outcome.decision.is_miss_no_commit()); + assert!(outcome.cache_to.is_none()); + assert!(outcome.stale_dirs.is_empty()); + } + + #[test] + fn decide_cow_miss_build_as_for_new_key() { + // Use a unique key that will not exist on disk. + let s = step(Some(Cache { + policy: "ttl".into(), + key: Some("deadbeefcafebabe9999".into()), + })); + let outcome = decide_cow(&s).unwrap(); + assert!( + outcome.decision.is_miss_build_as(), + "expected MissBuildAs, got {:?}", + outcome.decision + ); + assert!(outcome.cache_to.is_some()); + } } diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index e8449a2..27b88b0 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -33,7 +33,7 @@ use futures::future::{BoxFuture, FutureExt, join_all}; use anyhow::{Context, Result}; use hm_plugin_protocol::{ - ArchiveId, BuildEvent, ExecutorInput, PlanSummary, SnapshotRef, StepResult, + ArchiveId, BuildEvent, CacheDecision, ExecutorInput, PlanSummary, SnapshotRef, StepResult, }; use uuid::Uuid; @@ -312,22 +312,45 @@ async fn execute_step( display_name: display_name.clone(), }); - // Decide cache outcome host-side. - let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; - let decision = outcome.decision; + // Decide cache outcome host-side. In COW mode we check the + // workspace cache directory; otherwise we consult Docker images. + let (decision, cow_cache_to, cow_stale_dirs, docker_stale_tags) = + if run_ctx.workspace.is_some() { + let cow_outcome = cache::decide_cow(&step_wire)?; + ( + cow_outcome.decision, + cow_outcome.cache_to, + cow_outcome.stale_dirs, + vec![], + ) + } else { + let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; + (outcome.decision, None, vec![], outcome.stale_tags) + }; // Create a COW workspace for this step when running in COW mode. - // This must happen before the cache-hit short-circuit so that - // downstream steps can clone from this step's workspace. + // On a COW cache hit we restore from the cached directory instead + // of cloning from the parent workspace. if let Some(ref workspace) = run_ctx.workspace { let mut mgr = workspace .lock() .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; - mgr.create_workspace(&step_key, parent_key.as_deref()) - .context("create workspace for step")?; + if let CacheDecision::Hit { ref tag } = decision { + if tag.0.starts_with("cow:") { + let cached_path = PathBuf::from(&tag.0[4..]); + mgr.create_workspace_from_cache(&step_key, &cached_path) + .context("create workspace from cache")?; + } else { + mgr.create_workspace(&step_key, parent_key.as_deref()) + .context("create workspace for cache hit")?; + } + } else { + mgr.create_workspace(&step_key, parent_key.as_deref()) + .context("create workspace for step")?; + } } - if let hm_plugin_protocol::CacheDecision::Hit { tag } = &decision { + if let CacheDecision::Hit { tag } = &decision { bus.emit(BuildEvent::StepCacheHit { step_id, key: step_wire @@ -405,9 +428,31 @@ async fn execute_step( }); cancel.cancel(); } else { - for stale in &outcome.stale_tags { - if let Err(e) = run_ctx.docker.remove_image(stale).await { - tracing::warn!(image = %stale, %e, "failed to evict stale cache image"); + // Persist to COW cache and evict stale entries on success. + if run_ctx.workspace.is_some() { + if let Some(ref cache_to) = cow_cache_to { + let ws_path = { + let mgr = run_ctx + .workspace + .as_ref() + .unwrap() + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + mgr.workspace_path(&step_key).map(|p| p.to_path_buf()) + }; + if let Some(ws) = ws_path { + if let Err(e) = cache::persist_cow_cache(&ws, cache_to) { + tracing::warn!(%e, "failed to persist COW cache"); + } + } + } + cache::evict_stale_cow_dirs(&cow_stale_dirs); + } else { + // Docker mode: evict stale Docker cache images. + for stale in &docker_stale_tags { + if let Err(e) = run_ctx.docker.remove_image(stale).await { + tracing::warn!(image = %stale, %e, "failed to evict stale cache image"); + } } } } From d2ddb8811f132c0bd01467c48717dbba1a6dffda Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:57:59 -0700 Subject: [PATCH 12/25] fix: handle TOCTOU race in cache persist, add dup-key guard to from_cache --- crates/hm/src/orchestrator/cache.rs | 10 ++++++++-- crates/hm/src/orchestrator/workspace.rs | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/hm/src/orchestrator/cache.rs b/crates/hm/src/orchestrator/cache.rs index 78fe3c4..dc6e81d 100644 --- a/crates/hm/src/orchestrator/cache.rs +++ b/crates/hm/src/orchestrator/cache.rs @@ -196,8 +196,14 @@ pub fn persist_cow_cache(workspace_path: &Path, cache_dir: &Path) -> Result<()> if cache_dir.exists() { return Ok(()); } - hm_util::cow::cow_clone_dir(workspace_path, cache_dir) - .context("persist workspace to COW cache") + match hm_util::cow::cow_clone_dir(workspace_path, cache_dir) { + Ok(()) => Ok(()), + Err(e) if cache_dir.exists() => { + tracing::debug!(%e, "concurrent run already populated cache"); + Ok(()) + } + Err(e) => Err(e).context("persist workspace to COW cache"), + } } /// Remove stale COW cache directories left over from previous cache diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs index f49e7d1..5829cb0 100644 --- a/crates/hm/src/orchestrator/workspace.rs +++ b/crates/hm/src/orchestrator/workspace.rs @@ -107,6 +107,9 @@ impl WorkspaceManager { step_key: &str, cached_workspace: &Path, ) -> Result { + if self.workspaces.contains_key(step_key) || self.overlays.contains_key(step_key) { + anyhow::bail!("workspace for step '{step_key}' already exists"); + } self.create_clone(step_key, None, Some(cached_workspace)) } From 6f0f2ee731a82c38b3217155b20b44221d400dba Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 12:59:13 -0700 Subject: [PATCH 13/25] feat(dirs): add cache directory helpers --- crates/hm-util/src/dirs.rs | 22 ++++++++++++++++++++++ crates/hm/src/orchestrator/cache.rs | 11 +++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/crates/hm-util/src/dirs.rs b/crates/hm-util/src/dirs.rs index 4838e3f..e4b1120 100644 --- a/crates/hm-util/src/dirs.rs +++ b/crates/hm-util/src/dirs.rs @@ -31,6 +31,16 @@ pub fn harmont_plugin_state_dir() -> Option { harmont_data_dir().map(|d| d.join("state")) } +/// `~/.harmont/cache/` — local build cache root. +pub fn harmont_cache_dir() -> Option { + harmont_config_dir().map(|h| h.join("cache")) +} + +/// `~/.harmont/cache/workspaces/` — COW workspace cache root. +pub fn harmont_workspace_cache_dir() -> Option { + harmont_cache_dir().map(|c| c.join("workspaces")) +} + #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { @@ -59,4 +69,16 @@ mod tests { let p = harmont_plugin_state_dir().unwrap(); assert!(p.ends_with("harmont/state")); } + + #[test] + fn harmont_cache_dir_resolves() { + let p = harmont_cache_dir().unwrap(); + assert!(p.to_string_lossy().contains("cache")); + } + + #[test] + fn harmont_workspace_cache_dir_resolves() { + let p = harmont_workspace_cache_dir().unwrap(); + assert!(p.to_string_lossy().contains("workspaces")); + } } diff --git a/crates/hm/src/orchestrator/cache.rs b/crates/hm/src/orchestrator/cache.rs index dc6e81d..271d3aa 100644 --- a/crates/hm/src/orchestrator/cache.rs +++ b/crates/hm/src/orchestrator/cache.rs @@ -125,16 +125,11 @@ pub fn cow_cache_dir(step: &CommandStep) -> Result> { Some(k) => k, None => return Ok(None), }; - let base = hm_util::dirs::harmont_config_dir() - .ok_or_else(|| anyhow::anyhow!("cannot resolve ~/.harmont"))?; + let ws_cache = hm_util::dirs::harmont_workspace_cache_dir() + .ok_or_else(|| anyhow::anyhow!("cannot resolve ~/.harmont/cache/workspaces"))?; let safe = sanitize_for_tag(&step.key); let short = &key[..key.len().min(16)]; - Ok(Some( - base.join("cache") - .join("workspaces") - .join(safe) - .join(short), - )) + Ok(Some(ws_cache.join(safe).join(short))) } /// Decide cache outcome for a step against the local COW workspace From a8d16f5ef3ed22558bcea062209f6c1a6223a54c Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 13:02:25 -0700 Subject: [PATCH 14/25] test: add COW workspace E2E integration test --- crates/hm/tests/cow_workspace.rs | 75 +++++++++++++++++++ .../hm/tests/fixtures/pipelines/cow_chain.py | 10 +++ 2 files changed, 85 insertions(+) create mode 100644 crates/hm/tests/cow_workspace.rs create mode 100644 crates/hm/tests/fixtures/pipelines/cow_chain.py diff --git a/crates/hm/tests/cow_workspace.rs b/crates/hm/tests/cow_workspace.rs new file mode 100644 index 0000000..bcbdcdf --- /dev/null +++ b/crates/hm/tests/cow_workspace.rs @@ -0,0 +1,75 @@ +#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] +//! End-to-end test for COW workspace mode. +//! +//! Verifies that `hm run --cow` correctly propagates workspace state +//! across a three-step chain: step `a` writes a file, step `b` reads +//! it and writes another, step `c` reads both — proving COW workspace +//! inheritance works through the entire chain. +//! +//! Skipped unless `HARMONT_LOCAL_E2E=1` is set AND a Docker daemon is +//! reachable. +//! +//! ```sh +//! HARMONT_LOCAL_E2E=1 cargo test --test cow_workspace -- --test-threads=1 +//! ``` + +use std::path::PathBuf; +use std::process::Command; + +/// Returns true when the test should no-op. +fn skip_if_no_docker() -> bool { + if std::env::var_os("HARMONT_LOCAL_E2E").is_none() { + return true; + } + let out = Command::new("docker") + .args(["version", "--format", "{{.Server.Version}}"]) + .output(); + match out { + Ok(o) => !o.status.success(), + Err(_) => true, + } +} + +fn fixture(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/pipelines") + .join(name) +} + +#[test] +fn cow_chain_inherits_workspace() { + if skip_if_no_docker() { + return; + } + + let bin = env!("CARGO_BIN_EXE_hm"); + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let repo_root = manifest_dir + .parent() + .expect("CARGO_MANIFEST_DIR must have a parent (repo root)"); + + let tmp = tempfile::tempdir().expect("mktempdir"); + let harmont_dir = tmp.path().join(".harmont"); + std::fs::create_dir(&harmont_dir).expect("mkdir .harmont"); + std::fs::copy(fixture("cow_chain.py"), harmont_dir.join("cow_chain.py")) + .expect("copy fixture into .harmont/"); + + let out = Command::new(bin) + .args(["run", "--cow", "--logs", "--dir"]) + .arg(tmp.path()) + .arg("cow-chain") + .env("HARMONT_CIDSL_PY", repo_root.join("cidsl/py")) + .output() + .expect("spawning harmont binary should not fail"); + + let stderr = String::from_utf8_lossy(&out.stderr); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + out.status.success(), + "hm run --cow failed.\nstdout:\n{stdout}\nstderr:\n{stderr}" + ); + assert!( + stderr.contains("c-saw-both"), + "step c must see files from a and b via COW workspace.\nstdout:\n{stdout}\nstderr:\n{stderr}" + ); +} diff --git a/crates/hm/tests/fixtures/pipelines/cow_chain.py b/crates/hm/tests/fixtures/pipelines/cow_chain.py new file mode 100644 index 0000000..70f3952 --- /dev/null +++ b/crates/hm/tests/fixtures/pipelines/cow_chain.py @@ -0,0 +1,10 @@ +"""COW workspace E2E fixture — three-step chain proving workspace inheritance.""" +import harmont as hm + + +@hm.pipeline("cow-chain", default_image="alpine:latest") +def cow_chain(): + a = hm.sh("echo from-a > /workspace/a.txt", label="a") + b = a.sh("cat /workspace/a.txt && echo from-b > /workspace/b.txt", label="b") + c = b.sh("cat /workspace/a.txt && cat /workspace/b.txt && echo c-saw-both", label="c") + return [c] From 26fe08d6d62df644076b183dc90c3e993d91e377 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 13:56:39 -0700 Subject: [PATCH 15/25] style: apply rustfmt --- crates/hm-util/src/cow.rs | 12 ++--- crates/hm/src/commands/run/local.rs | 12 +++-- crates/hm/src/orchestrator/docker_client.rs | 7 +-- crates/hm/src/orchestrator/scheduler.rs | 40 +++++++------- crates/hm/src/orchestrator/workspace.rs | 58 +++++++-------------- crates/hm/src/runner/docker.rs | 21 ++------ 6 files changed, 60 insertions(+), 90 deletions(-) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index ee798ec..affa08c 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -145,11 +145,8 @@ fn try_platform_cow(src: &Path, dst: &Path) -> Result { } fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> { - std::fs::create_dir_all(dst) - .with_context(|| format!("create {}", dst.display()))?; - for entry in std::fs::read_dir(src) - .with_context(|| format!("read dir {}", src.display()))? - { + std::fs::create_dir_all(dst).with_context(|| format!("create {}", dst.display()))?; + for entry in std::fs::read_dir(src).with_context(|| format!("read dir {}", src.display()))? { let entry = entry?; let ty = entry.file_type()?; let src_path = entry.path(); @@ -250,7 +247,10 @@ impl OverlayMount { /// Returns an error if `fusermount` cannot be spawned or exits /// with a non-zero status. pub fn unmount(&self) -> Result<()> { - if !self.mounted.swap(false, std::sync::atomic::Ordering::AcqRel) { + if !self + .mounted + .swap(false, std::sync::atomic::Ordering::AcqRel) + { return Ok(()); } let bin = if which::which("fusermount3").is_ok() { diff --git a/crates/hm/src/commands/run/local.rs b/crates/hm/src/commands/run/local.rs index 93bb46d..6e7dfcb 100644 --- a/crates/hm/src/commands/run/local.rs +++ b/crates/hm/src/commands/run/local.rs @@ -83,8 +83,14 @@ pub async fn handle(args: RunArgs, ctx: RunContext) -> Result { other => anyhow::bail!("unknown --format '{other}'\n available: human, json"), }; - let exit_code = - crate::orchestrator::run(graph, repo_root, parallelism, runner_registry, renderer, args.cow) - .await?; + let exit_code = crate::orchestrator::run( + graph, + repo_root, + parallelism, + runner_registry, + renderer, + args.cow, + ) + .await?; Ok(exit_code) } diff --git a/crates/hm/src/orchestrator/docker_client.rs b/crates/hm/src/orchestrator/docker_client.rs index 6fefc28..208c9d0 100644 --- a/crates/hm/src/orchestrator/docker_client.rs +++ b/crates/hm/src/orchestrator/docker_client.rs @@ -14,11 +14,11 @@ use bollard::container::{ StopContainerOptions, }; use bollard::exec::{CreateExecOptions, StartExecResults}; -use bollard::models::HostConfig; use bollard::image::{ CommitContainerOptions, CreateImageOptions, ImportImageOptions, ListImagesOptions, RemoveImageOptions, }; +use bollard::models::HostConfig; use futures_util::StreamExt; use tokio::io::AsyncWrite; @@ -576,10 +576,7 @@ mod smoke { #[test] fn build_host_config_with_binds_and_no_caps() { - let hc = super::build_host_config( - &["/host/path:/container/path".to_string()], - &[], - ); + let hc = super::build_host_config(&["/host/path:/container/path".to_string()], &[]); assert_eq!( hc.binds.as_ref().unwrap(), &["/host/path:/container/path".to_string()] diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index 27b88b0..d4d7276 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -252,7 +252,9 @@ pub async fn run( // Clean up the COW workspace tree if one was created. if let Some(ref ws) = workspace { - let mut mgr = ws.lock().map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + let mut mgr = ws + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; if let Err(e) = mgr.cleanup() { tracing::warn!(%e, "failed to clean up COW workspace"); } @@ -314,19 +316,19 @@ async fn execute_step( // Decide cache outcome host-side. In COW mode we check the // workspace cache directory; otherwise we consult Docker images. - let (decision, cow_cache_to, cow_stale_dirs, docker_stale_tags) = - if run_ctx.workspace.is_some() { - let cow_outcome = cache::decide_cow(&step_wire)?; - ( - cow_outcome.decision, - cow_outcome.cache_to, - cow_outcome.stale_dirs, - vec![], - ) - } else { - let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; - (outcome.decision, None, vec![], outcome.stale_tags) - }; + let (decision, cow_cache_to, cow_stale_dirs, docker_stale_tags) = if run_ctx.workspace.is_some() + { + let cow_outcome = cache::decide_cow(&step_wire)?; + ( + cow_outcome.decision, + cow_outcome.cache_to, + cow_outcome.stale_dirs, + vec![], + ) + } else { + let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; + (outcome.decision, None, vec![], outcome.stale_tags) + }; // Create a COW workspace for this step when running in COW mode. // On a COW cache hit we restore from the cached directory instead @@ -432,12 +434,10 @@ async fn execute_step( if run_ctx.workspace.is_some() { if let Some(ref cache_to) = cow_cache_to { let ws_path = { - let mgr = run_ctx - .workspace - .as_ref() - .unwrap() - .lock() - .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + let mgr = + run_ctx.workspace.as_ref().unwrap().lock().map_err(|_| { + anyhow::anyhow!("workspace manager mutex poisoned") + })?; mgr.workspace_path(&step_key).map(|p| p.to_path_buf()) }; if let Some(ws) = ws_path { diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs index 5829cb0..42b88b7 100644 --- a/crates/hm/src/orchestrator/workspace.rs +++ b/crates/hm/src/orchestrator/workspace.rs @@ -119,9 +119,7 @@ impl WorkspaceManager { if let Some(p) = self.workspaces.get(step_key) { return Some(p.as_path()); } - self.overlays - .get(step_key) - .map(|l| l.merged_dir.as_path()) + self.overlays.get(step_key).map(|l| l.merged_dir.as_path()) } /// The base directory that root workspaces are cloned from. @@ -146,9 +144,8 @@ impl WorkspaceManager { // Drop overlay mounts before removing the filesystem tree. self.overlays.clear(); if self.run_dir.exists() { - std::fs::remove_dir_all(&self.run_dir).with_context(|| { - format!("cleanup run dir {}", self.run_dir.display()) - })?; + std::fs::remove_dir_all(&self.run_dir) + .with_context(|| format!("cleanup run dir {}", self.run_dir.display()))?; } Ok(()) } @@ -177,9 +174,8 @@ impl WorkspaceManager { self.base_dir.clone() }; - hm_util::cow::cow_clone_dir(&source, &ws_dir).with_context(|| { - format!("cow clone {} -> {}", source.display(), ws_dir.display()) - })?; + hm_util::cow::cow_clone_dir(&source, &ws_dir) + .with_context(|| format!("cow clone {} -> {}", source.display(), ws_dir.display()))?; self.workspaces.insert(step_key.to_string(), ws_dir.clone()); Ok(ws_dir) @@ -189,11 +185,7 @@ impl WorkspaceManager { // Overlay strategy // ------------------------------------------------------------------ - fn create_overlay( - &mut self, - step_key: &str, - parent_key: Option<&str>, - ) -> Result { + fn create_overlay(&mut self, step_key: &str, parent_key: Option<&str>) -> Result { let safe = sanitize_key(step_key); let layer_dir = self.run_dir.join("layers").join(&safe); let upper_dir = layer_dir.join("upper"); @@ -205,9 +197,10 @@ impl WorkspaceManager { std::fs::create_dir_all(&merged_dir)?; let ancestor_uppers = if let Some(pk) = parent_key { - let parent = self.overlays.get(pk).ok_or_else(|| { - anyhow::anyhow!("parent overlay '{pk}' not registered") - })?; + let parent = self + .overlays + .get(pk) + .ok_or_else(|| anyhow::anyhow!("parent overlay '{pk}' not registered"))?; let mut ancestors = vec![parent.upper_dir.clone()]; ancestors.extend(parent.ancestor_uppers.iter().cloned()); ancestors @@ -215,16 +208,10 @@ impl WorkspaceManager { vec![] }; - let mut lower_dirs: Vec<&Path> = - ancestor_uppers.iter().map(PathBuf::as_path).collect(); + let mut lower_dirs: Vec<&Path> = ancestor_uppers.iter().map(PathBuf::as_path).collect(); lower_dirs.push(&self.base_dir); - let mount = OverlayMount::mount( - &lower_dirs, - &upper_dir, - &work_dir, - &merged_dir, - )?; + let mount = OverlayMount::mount(&lower_dirs, &upper_dir, &work_dir, &merged_dir)?; let merged_path = merged_dir.clone(); self.overlays.insert( @@ -288,8 +275,7 @@ mod tests { fn root_step_clones_base() { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); let ws = mgr.create_workspace("build", None).unwrap(); assert_eq!( @@ -302,8 +288,7 @@ mod tests { fn child_step_inherits_parent_changes() { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); let ws_a = mgr.create_workspace("a", None).unwrap(); fs::write(ws_a.join("artifact.bin"), b"built").unwrap(); @@ -323,8 +308,7 @@ mod tests { fn fork_children_are_isolated() { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); let ws_a = mgr.create_workspace("a", None).unwrap(); fs::write(ws_a.join("from_a"), b"a").unwrap(); @@ -340,8 +324,7 @@ mod tests { fn workspace_path_returns_created() { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); mgr.create_workspace("s", None).unwrap(); assert!(mgr.workspace_path("s").is_some()); @@ -356,8 +339,7 @@ mod tests { fs::create_dir(&cached).unwrap(); fs::write(cached.join("cached_file.txt"), b"from_cache").unwrap(); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); let ws = mgr.create_workspace_from_cache("s", &cached).unwrap(); assert_eq!( fs::read_to_string(ws.join("cached_file.txt")).unwrap(), @@ -370,8 +352,7 @@ mod tests { fn duplicate_step_key_errors() { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); - let mut mgr = - WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(tmp.path().join("run"), base).unwrap(); mgr.create_workspace("dup", None).unwrap(); assert!(mgr.create_workspace("dup", None).is_err()); } @@ -381,8 +362,7 @@ mod tests { let tmp = tempfile::tempdir().unwrap(); let base = make_base(tmp.path()); let run_dir = tmp.path().join("run"); - let mut mgr = - WorkspaceManager::from_base(run_dir.clone(), base).unwrap(); + let mut mgr = WorkspaceManager::from_base(run_dir.clone(), base).unwrap(); mgr.create_workspace("s", None).unwrap(); assert!(run_dir.exists()); diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index 7187def..2c86e0d 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -280,19 +280,12 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result = input - .env - .iter() - .map(|(k, v)| format!("{k}={v}")) - .collect(); + let container_name = sanitize_container_name(&input.run_id.to_string(), &input.step.key); + let env_vec: Vec = input.env.iter().map(|(k, v)| format!("{k}={v}")).collect(); // Pull image if needed. if !ctx.docker.image_exists(&image).await.unwrap_or(false) { @@ -310,13 +303,7 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result Date: Tue, 26 May 2026 14:05:10 -0700 Subject: [PATCH 16/25] fix: resolve all clippy warnings in COW workspace code manual_let_else, missing_errors_doc, missing_panics_doc, expect_used, redundant_closure_for_method_calls, unnecessary_unwrap, collapsible_if. --- crates/hm-util/src/cow.rs | 8 ++++---- crates/hm/src/orchestrator/cache.rs | 20 +++++++++++++++----- crates/hm/src/orchestrator/scheduler.rs | 19 +++++++++---------- crates/hm/src/runner/docker.rs | 2 +- 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index affa08c..d5e5238 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -26,6 +26,7 @@ pub fn detect_strategy() -> CowStrategy { *STRATEGY.get_or_init(detect_strategy_inner) } +#[allow(clippy::missing_const_for_fn)] fn detect_strategy_inner() -> CowStrategy { #[cfg(target_os = "macos")] { @@ -49,9 +50,8 @@ fn detect_strategy_inner() -> CowStrategy { #[cfg(target_os = "linux")] fn probe_reflink() -> bool { - let tmp = match tempfile::tempdir() { - Ok(t) => t, - Err(_) => return false, + let Ok(tmp) = tempfile::tempdir() else { + return false; }; let src = tmp.path().join("src"); let dst = tmp.path().join("dst"); @@ -276,7 +276,7 @@ impl std::fmt::Debug for OverlayMount { f.debug_struct("OverlayMount") .field("merged", &self.merged) .field("upper", &self.upper) - .finish() + .finish_non_exhaustive() } } diff --git a/crates/hm/src/orchestrator/cache.rs b/crates/hm/src/orchestrator/cache.rs index 271d3aa..4496c47 100644 --- a/crates/hm/src/orchestrator/cache.rs +++ b/crates/hm/src/orchestrator/cache.rs @@ -116,14 +116,16 @@ pub struct CowCacheOutcome { /// /// Returns `None` when the step has no cache, a `"none"` policy, or no /// cache key — matching the same guard logic as [`cache_image_tag`]. +/// +/// # Errors +/// Returns an error if the config directory cannot be resolved. pub fn cow_cache_dir(step: &CommandStep) -> Result> { let cache = match step.cache.as_ref() { Some(c) if c.policy != "none" => c, _ => return Ok(None), }; - let key = match cache.key.as_deref() { - Some(k) => k, - None => return Ok(None), + let Some(key) = cache.key.as_deref() else { + return Ok(None); }; let ws_cache = hm_util::dirs::harmont_workspace_cache_dir() .ok_or_else(|| anyhow::anyhow!("cannot resolve ~/.harmont/cache/workspaces"))?; @@ -155,10 +157,18 @@ pub fn decide_cow(step: &CommandStep) -> Result { stale_dirs: vec![], }) } else { - let step_cache_root = cache_dir.parent().expect("cache_dir always has parent"); + let Some(step_cache_root) = cache_dir.parent() else { + return Ok(CowCacheOutcome { + decision: CacheDecision::MissBuildAs { + tag: SnapshotRef::from(format!("cow:{}", cache_dir.display())), + }, + cache_to: Some(cache_dir), + stale_dirs: vec![], + }); + }; let stale = if step_cache_root.exists() { std::fs::read_dir(step_cache_root)? - .filter_map(|e| e.ok()) + .filter_map(std::result::Result::ok) .map(|e| e.path()) .filter(|p| *p != cache_dir) .collect() diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index d4d7276..86672a7 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -431,19 +431,18 @@ async fn execute_step( cancel.cancel(); } else { // Persist to COW cache and evict stale entries on success. - if run_ctx.workspace.is_some() { + if let Some(ref ws_mgr) = run_ctx.workspace { if let Some(ref cache_to) = cow_cache_to { let ws_path = { - let mgr = - run_ctx.workspace.as_ref().unwrap().lock().map_err(|_| { - anyhow::anyhow!("workspace manager mutex poisoned") - })?; - mgr.workspace_path(&step_key).map(|p| p.to_path_buf()) + let mgr = ws_mgr.lock().map_err(|_| { + anyhow::anyhow!("workspace manager mutex poisoned") + })?; + mgr.workspace_path(&step_key).map(std::path::Path::to_path_buf) }; - if let Some(ws) = ws_path { - if let Err(e) = cache::persist_cow_cache(&ws, cache_to) { - tracing::warn!(%e, "failed to persist COW cache"); - } + if let Some(ws) = ws_path + && let Err(e) = cache::persist_cow_cache(&ws, cache_to) + { + tracing::warn!(%e, "failed to persist COW cache"); } } cache::evict_stale_cow_dirs(&cow_stale_dirs); diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index 2c86e0d..bf13e14 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -279,7 +279,7 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result Date: Tue, 26 May 2026 14:06:50 -0700 Subject: [PATCH 17/25] style: fix rustfmt in scheduler COW cache block --- crates/hm/src/orchestrator/scheduler.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index 86672a7..d580c12 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -434,10 +434,11 @@ async fn execute_step( if let Some(ref ws_mgr) = run_ctx.workspace { if let Some(ref cache_to) = cow_cache_to { let ws_path = { - let mgr = ws_mgr.lock().map_err(|_| { - anyhow::anyhow!("workspace manager mutex poisoned") - })?; - mgr.workspace_path(&step_key).map(std::path::Path::to_path_buf) + let mgr = ws_mgr + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + mgr.workspace_path(&step_key) + .map(std::path::Path::to_path_buf) }; if let Some(ws) = ws_path && let Err(e) = cache::persist_cow_cache(&ws, cache_to) From 0fe24f2d024d8949e783c2bc3188165c9296c497 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 16:53:15 -0700 Subject: [PATCH 18/25] =?UTF-8?q?feat(cow):=20hybrid=20mode=20=E2=80=94=20?= =?UTF-8?q?commit=20Docker=20images=20for=20system=20state=20inheritance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit COW mode now commits container images after each step, so child steps inherit system-level changes (apt-get installs, rustup, etc.) via Docker image lineage while workspace files propagate via COW bind mounts. --- crates/hm/src/orchestrator/scheduler.rs | 22 +++--- crates/hm/src/runner/docker.rs | 89 ++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 22 deletions(-) diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index d580c12..4ab584e 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -235,18 +235,16 @@ pub async fn run( let dur = started_total.elapsed().as_millis() as u64; - // Clean up ephemeral images (legacy mode only — COW mode has no Docker commits). - if !cow { - let ephemeral_tags: Vec<&str> = outcomes - .iter() - .filter_map(|o| o.snapshot.as_ref()) - .filter(|s| s.0.starts_with("harmont-local-ephemeral/")) - .map(|s| s.0.as_str()) - .collect(); - for tag in ephemeral_tags { - if let Err(e) = docker.remove_image(tag).await { - tracing::warn!(image = %tag, %e, "failed to remove ephemeral image"); - } + // Clean up ephemeral images created during the run. + let ephemeral_tags: Vec<&str> = outcomes + .iter() + .filter_map(|o| o.snapshot.as_ref()) + .filter(|s| s.0.starts_with("harmont-local-ephemeral/")) + .map(|s| s.0.as_str()) + .collect(); + for tag in ephemeral_tags { + if let Err(e) = docker.remove_image(tag).await { + tracing::warn!(image = %tag, %e, "failed to remove ephemeral image"); } } diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index bf13e14..4b5266b 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -249,10 +249,12 @@ async fn run_in_container( /// Pick the base image for a COW step. /// -/// In COW mode the workspace is bind-mounted, so there is no -/// parent-snapshot chain. We use the step's declared image or fall -/// back to `alpine:latest`. -fn resolve_image_cow(step: &CommandStep) -> String { +/// In COW mode the workspace is bind-mounted, but we still need +/// parent snapshots for system-level state (installed packages). +fn resolve_image_cow(step: &CommandStep, input: &ExecutorInput) -> String { + if let Some(snap) = &input.parent_snapshot { + return snap.to_string(); + } step.image .clone() .unwrap_or_else(|| "alpine:latest".to_string()) @@ -283,7 +285,7 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result = input.env.iter().map(|(k, v)| format!("{k}={v}")).collect(); @@ -307,7 +309,7 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result Result { let mut writer = StepLogWriter::new(input.step_id, Arc::clone(&ctx.event_bus)); let docker = ctx.docker.clone(); @@ -344,12 +347,52 @@ async fn run_cow_in_container( } }; - #[allow(clippy::cast_possible_truncation)] + #[allow( + clippy::cast_possible_truncation, + reason = "docker exit codes fit in i32" + )] let exit_code = rc as i32; + // Commit container so child steps inherit system-level changes + // (installed packages, etc.). Workspace files propagate via COW + // bind mounts, but the container image captures everything else. + let committed = if exit_code == 0 { + let target_tag = plan.commit_to.clone().unwrap_or_else(|| { + let safe: String = input + .step + .key + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '_' || c == '-' { + c + } else { + '-' + } + }) + .collect(); + SnapshotRef::from(format!( + "harmont-local-ephemeral/{safe}:run-{}", + input.step_id.simple() + )) + }); + match ctx + .docker + .commit_container(cid, &target_tag.to_string()) + .await + { + Ok(_) => Some(target_tag), + Err(e) => { + tracing::warn!(step_key = %input.step.key, "docker commit failed, step still succeeded: {e:#}"); + None + } + } + } else { + None + }; + Ok(StepResult { exit_code, - committed_snapshot: None, + committed_snapshot: committed, artifacts: vec![], }) } @@ -530,18 +573,44 @@ mod tests { } } + fn make_input(step: CommandStep, parent_snapshot: Option) -> ExecutorInput { + ExecutorInput { + step, + workspace_archive_id: hm_plugin_protocol::ArchiveId::from(uuid::Uuid::nil()), + env: std::collections::BTreeMap::new(), + workdir: "/workspace".into(), + run_id: uuid::Uuid::nil(), + step_id: uuid::Uuid::nil(), + cache_lookup: CacheDecision::MissNoCommit, + parent_snapshot, + } + } + // -- resolve_image_cow ---------------------------------------------------- #[test] fn resolve_image_cow_uses_step_image() { let s = step_with_image(Some("rust:1.82")); - assert_eq!(resolve_image_cow(&s), "rust:1.82"); + let input = make_input(s.clone(), None); + assert_eq!(resolve_image_cow(&s, &input), "rust:1.82"); } #[test] fn resolve_image_cow_fallback_alpine() { let s = step_with_image(None); - assert_eq!(resolve_image_cow(&s), "alpine:latest"); + let input = make_input(s.clone(), None); + assert_eq!(resolve_image_cow(&s, &input), "alpine:latest"); + } + + #[test] + fn resolve_image_cow_prefers_parent_snapshot() { + let s = step_with_image(Some("rust:1.82")); + let snap = SnapshotRef::from("harmont-local-ephemeral/base:abc123".to_string()); + let input = make_input(s.clone(), Some(snap)); + assert_eq!( + resolve_image_cow(&s, &input), + "harmont-local-ephemeral/base:abc123" + ); } // -- resolve_image ------------------------------------------------------- From 521c4b3471d7876333b55d838ad205e1018c41c3 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 22:25:12 -0700 Subject: [PATCH 19/25] refactor: make COW workspace the only execution path Remove --cow flag, non-COW Docker tar-extract path, Docker-image-only cache backend, and associated dead code. COW bind mounts + Docker image commits for system state is now the default and only mode. -465 lines removed. --- crates/hm/src/cli/run.rs | 4 - crates/hm/src/commands/run/local.rs | 11 +- crates/hm/src/orchestrator/cache.rs | 112 +-------- crates/hm/src/orchestrator/scheduler.rs | 85 +++---- crates/hm/src/runner/docker.rs | 304 ++---------------------- crates/hm/src/runner/mod.rs | 4 +- 6 files changed, 55 insertions(+), 465 deletions(-) diff --git a/crates/hm/src/cli/run.rs b/crates/hm/src/cli/run.rs index f145c68..2ac4697 100644 --- a/crates/hm/src/cli/run.rs +++ b/crates/hm/src/cli/run.rs @@ -42,8 +42,4 @@ pub struct RunArgs { /// Has no effect with `--format json`. #[arg(long)] pub logs: bool, - - /// Use COW workspace bind mounts instead of Docker image commits. - #[arg(long)] - pub cow: bool, } diff --git a/crates/hm/src/commands/run/local.rs b/crates/hm/src/commands/run/local.rs index 6e7dfcb..62523e4 100644 --- a/crates/hm/src/commands/run/local.rs +++ b/crates/hm/src/commands/run/local.rs @@ -83,14 +83,7 @@ pub async fn handle(args: RunArgs, ctx: RunContext) -> Result { other => anyhow::bail!("unknown --format '{other}'\n available: human, json"), }; - let exit_code = crate::orchestrator::run( - graph, - repo_root, - parallelism, - runner_registry, - renderer, - args.cow, - ) - .await?; + let exit_code = + crate::orchestrator::run(graph, repo_root, parallelism, runner_registry, renderer).await?; Ok(exit_code) } diff --git a/crates/hm/src/orchestrator/cache.rs b/crates/hm/src/orchestrator/cache.rs index 4496c47..1e3208e 100644 --- a/crates/hm/src/orchestrator/cache.rs +++ b/crates/hm/src/orchestrator/cache.rs @@ -1,41 +1,17 @@ //! Host-side cache decision. //! -//! Resolves a wire-typed [`CommandStep`] against the local Docker -//! daemon and returns the wire-typed [`CacheDecision`] consumed by -//! step-executor plugins (design spec §5.5). +//! Resolves a wire-typed [`CommandStep`] against the local COW +//! workspace cache directory and returns the wire-typed +//! [`CacheDecision`] consumed by step execution. //! //! Cache keys are computed by `harmont.keygen` at plan time and ride -//! along the JSON in `cache.key`. We turn them into Docker image tags -//! and consult the local image store. +//! along the JSON in `cache.key`. use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; use hm_plugin_protocol::{CacheDecision, CommandStep, SnapshotRef}; -use crate::orchestrator::docker_client::DockerClient; - -/// `harmont-local/:`. Step key is -/// sanitised to `[a-zA-Z0-9_-]` (Docker tag rules). Returns `None` -/// when the step has no cache or a policy of `"none"`. -/// -/// The cache key is the SHA-256 hex resolved at plan time by -/// `harmont.keygen`. We truncate to the first 16 hex chars (8 bytes) -/// for the image tag — collision odds across a developer's local -/// cache are negligible. The cloud path uses the full key elsewhere; -/// that divergence is acceptable for local-only tags since they're -/// never resolved across machines. -fn cache_image_tag(step: &CommandStep) -> Option { - let cache = step.cache.as_ref()?; - if cache.policy == "none" { - return None; - } - let key = cache.key.as_deref()?; - let safe = sanitize_for_tag(&step.key); - let short = &key[..key.len().min(16)]; - Some(format!("harmont-local/{safe}:{short}")) -} - fn sanitize_for_tag(s: &str) -> String { s.chars() .map(|c| { @@ -48,58 +24,6 @@ fn sanitize_for_tag(s: &str) -> String { .collect() } -/// The outcome of a cache lookup: the wire-typed decision plus any -/// stale images that should be garbage-collected after the new image -/// is committed. -#[derive(Debug)] -pub struct CacheOutcome { - pub decision: CacheDecision, - /// Stale cache images for this step that should be removed after - /// the new image is committed successfully. - pub stale_tags: Vec, -} - -/// Decide cache outcome for a step against the local Docker daemon. -/// -/// Returns hit (snapshot already present), miss-with-tag (run and commit -/// afterwards), or miss-no-commit (`cache.policy == "none"` or no cache -/// key). -/// -/// # Errors -/// Returns an error if the Docker daemon `image_exists` call fails. -pub async fn decide(docker: &DockerClient, step: &CommandStep) -> Result { - let Some(tag) = cache_image_tag(step) else { - return Ok(CacheOutcome { - decision: CacheDecision::MissNoCommit, - stale_tags: vec![], - }); - }; - if docker.image_exists(&tag).await? { - Ok(CacheOutcome { - decision: CacheDecision::Hit { - tag: SnapshotRef::from(tag), - }, - stale_tags: vec![], - }) - } else { - let safe = sanitize_for_tag(&step.key); - let prefix = format!("harmont-local/{safe}"); - let stale = docker - .list_images_by_reference(&prefix) - .await - .unwrap_or_default() - .into_iter() - .filter(|t| *t != tag) - .collect(); - Ok(CacheOutcome { - decision: CacheDecision::MissBuildAs { - tag: SnapshotRef::from(tag), - }, - stale_tags: stale, - }) - } -} - // --------------------------------------------------------------------------- // COW workspace cache // --------------------------------------------------------------------------- @@ -241,30 +165,6 @@ mod tests { } } - #[test] - fn no_cache_yields_none() { - assert!(cache_image_tag(&step(None)).is_none()); - } - - #[test] - fn policy_none_yields_none() { - let s = step(Some(Cache { - policy: "none".into(), - key: Some("abcdef".into()), - })); - assert!(cache_image_tag(&s).is_none()); - } - - #[test] - fn ttl_with_key_yields_tag() { - let s = step(Some(Cache { - policy: "ttl".into(), - key: Some("0123456789abcdefffff".into()), - })); - let tag = cache_image_tag(&s).unwrap(); - assert!(tag.starts_with("harmont-local/build:")); - } - #[test] fn sanitize_replaces_invalid_chars() { assert_eq!(sanitize_for_tag("my/step.name:v1"), "my-step-name-v1"); @@ -272,10 +172,6 @@ mod tests { assert_eq!(sanitize_for_tag("a_b-c"), "a_b-c"); } - // ------------------------------------------------------------------ - // COW workspace cache tests - // ------------------------------------------------------------------ - #[test] fn cow_cache_dir_returns_path_for_cacheable_step() { let s = step(Some(Cache { diff --git a/crates/hm/src/orchestrator/scheduler.rs b/crates/hm/src/orchestrator/scheduler.rs index 4ab584e..cfbecd4 100644 --- a/crates/hm/src/orchestrator/scheduler.rs +++ b/crates/hm/src/orchestrator/scheduler.rs @@ -72,7 +72,6 @@ pub async fn run( parallelism: usize, runner_registry: Arc, renderer: Box, - cow: bool, ) -> Result { // Set up per-run state. let bus = EventBus::new(); @@ -91,17 +90,14 @@ pub async fn run( // Build the source archive once. let archive_bytes = build_archive_bytes(&repo_root).context("build source archive")?; - // When COW mode is enabled, extract the source archive into a - // temporary directory and create a workspace manager that will - // produce per-step COW clones. This must happen before - // `archives.register()` which consumes the bytes. - let workspace = if cow { - let run_dir = std::env::temp_dir().join(format!("harmont-run-{run_id}")); + // Extract the source archive into a temporary directory and create + // a workspace manager for per-step COW clones. This must happen + // before `archives.register()` which consumes the bytes. + let run_dir = std::env::temp_dir().join(format!("harmont-run-{run_id}")); + let workspace = { let mgr = super::workspace::WorkspaceManager::from_archive(run_dir, &archive_bytes) .context("init COW workspace")?; - Some(Arc::new(std::sync::Mutex::new(mgr))) - } else { - None + Arc::new(std::sync::Mutex::new(mgr)) }; let archive_id = archives.register(archive_bytes); @@ -248,9 +244,8 @@ pub async fn run( } } - // Clean up the COW workspace tree if one was created. - if let Some(ref ws) = workspace { - let mut mgr = ws + { + let mut mgr = workspace .lock() .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; if let Err(e) = mgr.cleanup() { @@ -312,27 +307,14 @@ async fn execute_step( display_name: display_name.clone(), }); - // Decide cache outcome host-side. In COW mode we check the - // workspace cache directory; otherwise we consult Docker images. - let (decision, cow_cache_to, cow_stale_dirs, docker_stale_tags) = if run_ctx.workspace.is_some() - { - let cow_outcome = cache::decide_cow(&step_wire)?; - ( - cow_outcome.decision, - cow_outcome.cache_to, - cow_outcome.stale_dirs, - vec![], - ) - } else { - let outcome = cache::decide(&run_ctx.docker, &step_wire).await?; - (outcome.decision, None, vec![], outcome.stale_tags) - }; + let cow_outcome = cache::decide_cow(&step_wire)?; + let decision = cow_outcome.decision; + let cow_cache_to = cow_outcome.cache_to; + let cow_stale_dirs = cow_outcome.stale_dirs; - // Create a COW workspace for this step when running in COW mode. - // On a COW cache hit we restore from the cached directory instead - // of cloning from the parent workspace. - if let Some(ref workspace) = run_ctx.workspace { - let mut mgr = workspace + { + let mut mgr = run_ctx + .workspace .lock() .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; if let CacheDecision::Hit { ref tag } = decision { @@ -428,31 +410,22 @@ async fn execute_step( }); cancel.cancel(); } else { - // Persist to COW cache and evict stale entries on success. - if let Some(ref ws_mgr) = run_ctx.workspace { - if let Some(ref cache_to) = cow_cache_to { - let ws_path = { - let mgr = ws_mgr - .lock() - .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; - mgr.workspace_path(&step_key) - .map(std::path::Path::to_path_buf) - }; - if let Some(ws) = ws_path - && let Err(e) = cache::persist_cow_cache(&ws, cache_to) - { - tracing::warn!(%e, "failed to persist COW cache"); - } - } - cache::evict_stale_cow_dirs(&cow_stale_dirs); - } else { - // Docker mode: evict stale Docker cache images. - for stale in &docker_stale_tags { - if let Err(e) = run_ctx.docker.remove_image(stale).await { - tracing::warn!(image = %stale, %e, "failed to evict stale cache image"); - } + if let Some(ref cache_to) = cow_cache_to { + let ws_path = { + let mgr = run_ctx + .workspace + .lock() + .map_err(|_| anyhow::anyhow!("workspace manager mutex poisoned"))?; + mgr.workspace_path(&step_key) + .map(std::path::Path::to_path_buf) + }; + if let Some(ws) = ws_path + && let Err(e) = cache::persist_cow_cache(&ws, cache_to) + { + tracing::warn!(%e, "failed to persist COW cache"); } } + cache::evict_stale_cow_dirs(&cow_stale_dirs); } Ok(StepOutcome { exit_code: sr.exit_code, diff --git a/crates/hm/src/runner/docker.rs b/crates/hm/src/runner/docker.rs index 4b5266b..68183a7 100644 --- a/crates/hm/src/runner/docker.rs +++ b/crates/hm/src/runner/docker.rs @@ -1,9 +1,9 @@ //! Docker-based step runner. //! -//! Replaces the old `hm-plugin-docker` WASM plugin with direct Bollard -//! calls. All Docker orchestration (pull, start, extract, exec, commit, -//! stop+remove) runs through [`RunContext::docker`] with cancellation -//! support via [`RunContext::cancel`]. +//! Each step runs inside a Docker container with the workspace +//! bind-mounted from the host via COW clones. System-level state +//! (installed packages) propagates via Docker image commits; workspace +//! files propagate via host-side COW directory clones. use std::future::Future; use std::pin::Pin; @@ -18,38 +18,6 @@ use uuid::Uuid; use super::{RunContext, StepRunner}; use crate::orchestrator::events::EventBus; -// --------------------------------------------------------------------------- -// EXTRACT_CMD_SH -// --------------------------------------------------------------------------- - -/// Shell script for idempotent workspace extraction. Reads a `.harmont-extracted` -/// manifest to clean up files from a previous extract, then unpacks the new -/// archive and writes a fresh manifest. Files created by the step command -/// (e.g. `node_modules`, build artifacts) are not tracked and survive untouched. -const EXTRACT_CMD_SH: &str = r#"set -e -mkdir -p "$WORKDIR" -cd "$WORKDIR" -manifest="$WORKDIR/.harmont-extracted" -if [ -f "$manifest" ]; then - # Longest paths first: removes nested entries before their parents. - sort -r "$manifest" | while IFS= read -r p; do - [ -n "$p" ] || continue - if [ -d "$p" ] && [ ! -L "$p" ]; then - rmdir "$p" 2>/dev/null || true - else - rm -f "$p" 2>/dev/null || true - fi - done - rm -f "$manifest" -fi -# Stream the archive into a temp file so we can both list and extract. -tmp=$(mktemp) -trap 'rm -f "$tmp"' EXIT -cat > "$tmp" -tar -tzf "$tmp" > "$manifest" -tar -xzf "$tmp" -"#; - // --------------------------------------------------------------------------- // DockerRunner // --------------------------------------------------------------------------- @@ -70,188 +38,15 @@ impl StepRunner for DockerRunner { input: ExecutorInput, ) -> Pin> + Send + '_>> { let ctx = ctx.clone(); - Box::pin(async move { - if ctx.workspace.is_some() { - run_step_cow(&ctx, input).await - } else { - run_step(&ctx, input).await - } - }) + Box::pin(async move { run_step(&ctx, input).await }) } } // --------------------------------------------------------------------------- -// Core orchestration +// Step execution // --------------------------------------------------------------------------- -async fn run_step(ctx: &RunContext, input: ExecutorInput) -> Result { - let plan = decision_plan(&input.cache_lookup); - - // Cache hit shortcut: no container, no exec; hand back the hit - // tag so downstream steps can boot from it. - if !plan.run_command { - return Ok(StepResult { - exit_code: 0, - committed_snapshot: plan.hit_tag.clone(), - artifacts: vec![], - }); - } - - let image = resolve_image( - &input.step, - plan.hit_tag.as_ref(), - input.parent_snapshot.as_ref(), - ); - let container_name = sanitize_container_name(&input.run_id.to_string(), &input.step.key); - let env_vec: Vec = input.env.iter().map(|(k, v)| format!("{k}={v}")).collect(); - - // Ensure the image is locally available. - if !ctx.docker.image_exists(&image).await.unwrap_or(false) { - let docker = ctx.docker.clone(); - let cancel = ctx.cancel.clone(); - let img = image.clone(); - let pull_fut = async move { docker.pull_image(&img).await }; - tokio::select! { - result = pull_fut => result.with_context(|| format!("pull '{image}'"))?, - () = cancel.cancelled() => anyhow::bail!("cancelled during image pull"), - } - } - - let cid = ctx - .docker - .start_long_lived(&image, &env_vec, &input.workdir, &container_name) - .await - .context("docker start failed")?; - - // Always stop+remove the container, even on error. - let result = run_in_container(ctx, &cid, &input, &env_vec, &plan).await; - ctx.docker.stop_remove(&cid).await; - result -} - -/// Inner body executed with a running container. Separated so the -/// caller can unconditionally clean up the container in all paths. -async fn run_in_container( - ctx: &RunContext, - cid: &str, - input: &ExecutorInput, - env_vec: &[String], - plan: &DecisionPlan, -) -> Result { - // --- Extract workspace archive --- - let archive = ctx.archives.read(input.workspace_archive_id, 0, u64::MAX); - if archive.is_empty() { - anyhow::bail!("archive {} is empty or unknown", input.workspace_archive_id); - } - - let docker = ctx.docker.clone(); - let cancel = ctx.cancel.clone(); - let cid_owned = cid.to_owned(); - let workdir = input.workdir.clone(); - let cmd = vec![ - "sh".to_string(), - "-c".to_string(), - EXTRACT_CMD_SH.replace("$WORKDIR", &workdir), - ]; - let extract_fut = async move { - let mut sink = tokio::io::sink(); - let rc = docker - .exec_streaming_stdin(&cid_owned, &cmd, &[], "/", &archive, &mut sink) - .await?; - if rc != 0 { - anyhow::bail!("tar extract exited {rc}"); - } - Ok::<(), anyhow::Error>(()) - }; - tokio::select! { - result = extract_fut => result.context("workspace extract failed")?, - () = cancel.cancelled() => anyhow::bail!("cancelled during workspace extract"), - } - - // --- Exec step command --- - let mut writer = StepLogWriter::new(input.step_id, Arc::clone(&ctx.event_bus)); - let docker = ctx.docker.clone(); - let cancel = ctx.cancel.clone(); - let cid_owned = cid.to_owned(); - let cmd = vec!["sh".into(), "-c".into(), input.step.cmd.clone()]; - let workdir = input.workdir.clone(); - let env_owned = env_vec.to_vec(); - let exec_fut = async move { - let rc = docker - .exec_streaming(&cid_owned, &cmd, &env_owned, &workdir, &mut writer) - .await?; - writer.flush_remaining(); - Ok::(rc) - }; - - let rc = tokio::select! { - result = exec_fut => result.context("docker exec failed")?, - () = cancel.cancelled() => { - return Ok(StepResult { - exit_code: 130, - committed_snapshot: None, - artifacts: vec![], - }); - } - }; - - #[allow( - clippy::cast_possible_truncation, - reason = "docker exit codes fit in i32" - )] - let exit_code = rc as i32; - - // --- Commit snapshot on success --- - let committed = if exit_code == 0 { - let target_tag = plan.commit_to.clone().unwrap_or_else(|| { - let safe: String = input - .step - .key - .chars() - .map(|c| { - if c.is_ascii_alphanumeric() || c == '_' || c == '-' { - c - } else { - '-' - } - }) - .collect(); - SnapshotRef::from(format!( - "harmont-local-ephemeral/{safe}:run-{}", - input.step_id.simple() - )) - }); - match ctx - .docker - .commit_container(cid, &target_tag.to_string()) - .await - { - Ok(_) => Some(target_tag), - Err(e) => { - tracing::warn!(step_key = %input.step.key, "docker commit failed, step still succeeded: {e:#}"); - None - } - } - } else { - None - }; - - Ok(StepResult { - exit_code, - committed_snapshot: committed, - artifacts: vec![], - }) -} - -// --------------------------------------------------------------------------- -// COW execution path -// --------------------------------------------------------------------------- - -/// Pick the base image for a COW step. -/// -/// In COW mode the workspace is bind-mounted, but we still need -/// parent snapshots for system-level state (installed packages). -fn resolve_image_cow(step: &CommandStep, input: &ExecutorInput) -> String { +fn resolve_image(step: &CommandStep, input: &ExecutorInput) -> String { if let Some(snap) = &input.parent_snapshot { return snap.to_string(); } @@ -260,7 +55,7 @@ fn resolve_image_cow(step: &CommandStep, input: &ExecutorInput) -> String { .unwrap_or_else(|| "alpine:latest".to_string()) } -async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result { +async fn run_step(ctx: &RunContext, input: ExecutorInput) -> Result { let plan = decision_plan(&input.cache_lookup); if !plan.run_command { @@ -271,10 +66,7 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result Result = input.env.iter().map(|(k, v)| format!("{k}={v}")).collect(); @@ -309,12 +101,12 @@ async fn run_step_cow(ctx: &RunContext, input: ExecutorInput) -> Result DecisionPlan { } } -// --------------------------------------------------------------------------- -// resolve_image -// --------------------------------------------------------------------------- - -/// Pick the base image for a step at boot time. -/// -/// Priority (high to low): -/// 1. Cache `hit_tag` — the host already located a satisfying snapshot. -/// 2. `parent_snapshot` — the previous step in this chain committed a -/// snapshot; chain-lineage requires we boot from it. -/// 3. The step's `image` field. -/// 4. Fall back to `"alpine:latest"`. -fn resolve_image( - step: &CommandStep, - hit_tag: Option<&SnapshotRef>, - parent_snapshot: Option<&SnapshotRef>, -) -> String { - if let Some(tag) = hit_tag { - return tag.to_string(); - } - if let Some(snap) = parent_snapshot { - return snap.to_string(); - } - if let Some(image) = &step.image { - return image.clone(); - } - "alpine:latest".to_string() -} - // --------------------------------------------------------------------------- // sanitize_container_name // --------------------------------------------------------------------------- @@ -586,62 +349,33 @@ mod tests { } } - // -- resolve_image_cow ---------------------------------------------------- + // -- resolve_image ---------------------------------------------------- #[test] - fn resolve_image_cow_uses_step_image() { + fn resolve_image_uses_step_image() { let s = step_with_image(Some("rust:1.82")); let input = make_input(s.clone(), None); - assert_eq!(resolve_image_cow(&s, &input), "rust:1.82"); + assert_eq!(resolve_image(&s, &input), "rust:1.82"); } #[test] - fn resolve_image_cow_fallback_alpine() { + fn resolve_image_fallback_alpine() { let s = step_with_image(None); let input = make_input(s.clone(), None); - assert_eq!(resolve_image_cow(&s, &input), "alpine:latest"); + assert_eq!(resolve_image(&s, &input), "alpine:latest"); } #[test] - fn resolve_image_cow_prefers_parent_snapshot() { + fn resolve_image_prefers_parent_snapshot() { let s = step_with_image(Some("rust:1.82")); let snap = SnapshotRef::from("harmont-local-ephemeral/base:abc123".to_string()); let input = make_input(s.clone(), Some(snap)); assert_eq!( - resolve_image_cow(&s, &input), + resolve_image(&s, &input), "harmont-local-ephemeral/base:abc123" ); } - // -- resolve_image ------------------------------------------------------- - - #[test] - fn resolve_image_hit_tag_wins() { - let s = step_with_image(Some("rust:1.82")); - let hit = SnapshotRef("cache:tag".into()); - let parent = SnapshotRef("parent:tag".into()); - assert_eq!(resolve_image(&s, Some(&hit), Some(&parent)), "cache:tag"); - } - - #[test] - fn resolve_image_parent_snapshot_beats_step_image() { - let s = step_with_image(Some("rust:1.82")); - let parent = SnapshotRef("parent:tag".into()); - assert_eq!(resolve_image(&s, None, Some(&parent)), "parent:tag"); - } - - #[test] - fn resolve_image_step_image_used() { - let s = step_with_image(Some("rust:1.82")); - assert_eq!(resolve_image(&s, None, None), "rust:1.82"); - } - - #[test] - fn resolve_image_fallback_alpine() { - let s = step_with_image(None); - assert_eq!(resolve_image(&s, None, None), "alpine:latest"); - } - // -- decision_plan ------------------------------------------------------- #[test] diff --git a/crates/hm/src/runner/mod.rs b/crates/hm/src/runner/mod.rs index 9e32b89..4944800 100644 --- a/crates/hm/src/runner/mod.rs +++ b/crates/hm/src/runner/mod.rs @@ -37,9 +37,7 @@ pub struct RunContext { pub event_bus: Arc, pub archives: Arc, pub cancel: CancellationToken, - /// When present, steps use COW workspace bind mounts instead of - /// tar.gz extraction + docker commit. - pub workspace: Option>>, + pub workspace: Arc>, } // --------------------------------------------------------------------------- From 53882d38198880c79f0e61a15a37e8aaf2ee032e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 22:37:42 -0700 Subject: [PATCH 20/25] fix: add allow_other to fuse-overlayfs mounts for Docker visibility FUSE mounts are only visible to the mounting user by default. Docker daemon runs as root so it can't bind-mount from fuse-overlayfs paths. Adding allow_other fixes this. The probe now also tests a real mount with allow_other so it falls back gracefully when user_allow_other is not enabled in /etc/fuse.conf. CI workflows updated to uncomment user_allow_other in /etc/fuse.conf. --- .github/workflows/ci.yml | 3 +++ .github/workflows/examples.yml | 3 +++ crates/hm-util/src/cow.rs | 43 ++++++++++++++++++++++++++++++++-- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 984239d..9474083 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,6 +118,9 @@ jobs: - name: Build hm run: cargo build -p harmont-cli + - name: Enable FUSE allow_other + run: sudo sed -i 's/#user_allow_other/user_allow_other/' /etc/fuse.conf + - name: Restore harmont Docker cache uses: actions/cache/restore@v4 with: diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index d923926..ad8eff1 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -109,6 +109,9 @@ jobs: - name: Mark hm executable run: chmod +x bin/hm + - name: Enable FUSE allow_other + run: sudo sed -i 's/#user_allow_other/user_allow_other/' /etc/fuse.conf + - name: Run example via hm run working-directory: examples/${{ matrix.example }} env: diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index d5e5238..97c1032 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -69,7 +69,46 @@ fn probe_reflink() -> bool { #[cfg(target_os = "linux")] fn probe_fuse_overlayfs() -> bool { - which::which("fuse-overlayfs").is_ok() + if which::which("fuse-overlayfs").is_err() { + return false; + } + let Ok(tmp) = tempfile::tempdir() else { + return false; + }; + let lower = tmp.path().join("lower"); + let upper = tmp.path().join("upper"); + let work = tmp.path().join("work"); + let merged = tmp.path().join("merged"); + for d in [&lower, &upper, &work, &merged] { + if std::fs::create_dir(d).is_err() { + return false; + } + } + let opts = format!( + "lowerdir={},upperdir={},workdir={},allow_other", + lower.display(), + upper.display(), + work.display(), + ); + let ok = Command::new("fuse-overlayfs") + .args(["-o", &opts]) + .arg(&merged) + .stderr(std::process::Stdio::null()) + .status() + .is_ok_and(|s| s.success()); + if ok { + let bin = if which::which("fusermount3").is_ok() { + "fusermount3" + } else { + "fusermount" + }; + let _ = Command::new(bin) + .args(["-u"]) + .arg(&merged) + .stderr(std::process::Stdio::null()) + .status(); + } + ok } // ----------------------------------------------------------------------- @@ -201,7 +240,7 @@ impl OverlayMount { .join(":"); let opts = format!( - "lowerdir={lowerdir},upperdir={},workdir={}", + "lowerdir={lowerdir},upperdir={},workdir={},allow_other", upper_dir.display(), work_dir.display(), ); From 1943146c52f3f482431c6ad7a34b10051916a5c7 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 22:38:52 -0700 Subject: [PATCH 21/25] feat: add COW strategy diagnostics with slow-path warning diagnose_strategies() probes each COW backend and reports availability with reasons. WorkspaceManager warns at startup when falling back to FullCopy and logs which strategies were tried and why they failed. --- crates/hm-util/src/cow.rs | 56 +++++++++++++++++++++++++ crates/hm/src/orchestrator/workspace.rs | 10 +++++ 2 files changed, 66 insertions(+) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index 97c1032..042e5d4 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -26,6 +26,62 @@ pub fn detect_strategy() -> CowStrategy { *STRATEGY.get_or_init(detect_strategy_inner) } +/// Probe result for a single strategy. +#[derive(Debug, Clone)] +pub struct StrategyProbe { + pub strategy: CowStrategy, + pub available: bool, + pub reason: &'static str, +} + +/// Test all strategies and report which are available. +/// Used for diagnostics / user-facing warnings. +#[must_use] +#[allow(clippy::vec_init_then_push)] +pub fn diagnose_strategies() -> Vec { + let mut probes = Vec::new(); + + #[cfg(target_os = "macos")] + probes.push(StrategyProbe { + strategy: CowStrategy::ApfsClone, + available: true, + reason: "macOS APFS detected", + }); + + #[cfg(target_os = "linux")] + { + probes.push(StrategyProbe { + strategy: CowStrategy::Reflink, + available: probe_reflink(), + reason: if probe_reflink() { + "filesystem supports reflinks" + } else { + "filesystem does not support reflinks (btrfs/XFS required)" + }, + }); + let fuse_ok = probe_fuse_overlayfs(); + probes.push(StrategyProbe { + strategy: CowStrategy::FuseOverlay, + available: fuse_ok, + reason: if fuse_ok { + "fuse-overlayfs mount succeeded" + } else if which::which("fuse-overlayfs").is_err() { + "fuse-overlayfs not installed" + } else { + "fuse-overlayfs mount failed (missing /dev/fuse or user_allow_other?)" + }, + }); + } + + probes.push(StrategyProbe { + strategy: CowStrategy::FullCopy, + available: true, + reason: "always available (slow)", + }); + + probes +} + #[allow(clippy::missing_const_for_fn)] fn detect_strategy_inner() -> CowStrategy { #[cfg(target_os = "macos")] diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs index 42b88b7..a02028c 100644 --- a/crates/hm/src/orchestrator/workspace.rs +++ b/crates/hm/src/orchestrator/workspace.rs @@ -47,6 +47,16 @@ impl WorkspaceManager { .with_context(|| format!("create run dir {}", run_dir.display()))?; let strategy = hm_util::cow::detect_strategy(); tracing::info!(?strategy, "COW workspace strategy"); + if strategy == hm_util::cow::CowStrategy::FullCopy { + tracing::warn!("using full-copy fallback — workspace cloning will be slow"); + for probe in hm_util::cow::diagnose_strategies() { + if probe.available { + tracing::info!(strategy = ?probe.strategy, reason = probe.reason, "available"); + } else { + tracing::info!(strategy = ?probe.strategy, reason = probe.reason, "unavailable"); + } + } + } Ok(Self { run_dir, base_dir, From 11774e3152e7811809b798223e2b7bf0421520f2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 26 May 2026 22:42:01 -0700 Subject: [PATCH 22/25] fix: add squash_to_uid/gid=0 to fuse-overlayfs mounts Docker containers run as root but fuse-overlayfs runs as the host user. Without UID squashing, the FUSE daemon can't create files owned by UID 0 in the upper dir, causing EPERM on mkdir inside bind-mounted overlays. --- crates/hm-util/src/cow.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index 042e5d4..e2bdd7f 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -141,7 +141,7 @@ fn probe_fuse_overlayfs() -> bool { } } let opts = format!( - "lowerdir={},upperdir={},workdir={},allow_other", + "lowerdir={},upperdir={},workdir={},allow_other,squash_to_uid=0,squash_to_gid=0", lower.display(), upper.display(), work.display(), @@ -296,7 +296,7 @@ impl OverlayMount { .join(":"); let opts = format!( - "lowerdir={lowerdir},upperdir={},workdir={},allow_other", + "lowerdir={lowerdir},upperdir={},workdir={},allow_other,squash_to_uid=0,squash_to_gid=0", upper_dir.display(), work_dir.display(), ); From 04076328c3e6a208c66e7f4c118707306599775f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Wed, 27 May 2026 00:45:30 -0700 Subject: [PATCH 23/25] readme fix --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f42e471..1f80327 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@

- Website · Docs · Slack + Website · Docs · Slack

> [!WARNING] @@ -174,7 +174,7 @@ Go, Python, Java, C++, React, Next.js, and more. ## Documentation For the full pipeline reference, rich examples, and more — see the -[docs](https://harmont.dev/docs). +[docs](https://docs.harmont.dev). ## License From 0cb736c2fa101b91113e2ad2650503b958b78876 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Wed, 27 May 2026 00:47:28 -0700 Subject: [PATCH 24/25] deslop --- crates/hm/src/orchestrator/workspace.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/crates/hm/src/orchestrator/workspace.rs b/crates/hm/src/orchestrator/workspace.rs index a02028c..8a69a52 100644 --- a/crates/hm/src/orchestrator/workspace.rs +++ b/crates/hm/src/orchestrator/workspace.rs @@ -160,10 +160,6 @@ impl WorkspaceManager { Ok(()) } - // ------------------------------------------------------------------ - // Clone strategy - // ------------------------------------------------------------------ - fn create_clone( &mut self, step_key: &str, @@ -191,10 +187,6 @@ impl WorkspaceManager { Ok(ws_dir) } - // ------------------------------------------------------------------ - // Overlay strategy - // ------------------------------------------------------------------ - fn create_overlay(&mut self, step_key: &str, parent_key: Option<&str>) -> Result { let safe = sanitize_key(step_key); let layer_dir = self.run_dir.join("layers").join(&safe); From 989baf793b8deb9349b5c49bf487c732fe760ef4 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Wed, 27 May 2026 00:48:11 -0700 Subject: [PATCH 25/25] deslop --- crates/hm-util/src/cow.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/crates/hm-util/src/cow.rs b/crates/hm-util/src/cow.rs index e2bdd7f..fd0ac7b 100644 --- a/crates/hm-util/src/cow.rs +++ b/crates/hm-util/src/cow.rs @@ -6,10 +6,6 @@ use std::sync::OnceLock; use anyhow::{Context, Result, bail}; -// ----------------------------------------------------------------------- -// Strategy detection -// ----------------------------------------------------------------------- - #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CowStrategy { ApfsClone, @@ -167,10 +163,6 @@ fn probe_fuse_overlayfs() -> bool { ok } -// ----------------------------------------------------------------------- -// cow_clone_dir -// ----------------------------------------------------------------------- - /// Clone `src` to `dst` using the best available COW mechanism. /// /// # Errors @@ -262,10 +254,6 @@ fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> { Ok(()) } -// ----------------------------------------------------------------------- -// OverlayMount — fuse-overlayfs lifecycle (strategy 3) -// ----------------------------------------------------------------------- - pub struct OverlayMount { merged: PathBuf, upper: PathBuf,