Skip to content

[Detail Bug] Temporary file leak on tool exception in reward_config_auditing #45

@detail-app

Description

@detail-app

Summary

  • Context: The reward_config_auditing function applies patches to configuration files and re-scans them with security tools (KubeLinter or Semgrep) to verify patch effectiveness as part of the reward calculation.
  • Bug: Temporary files created during patch verification are not cleaned up if the security tool scanning raises an exception.
  • Actual vs. expected: When kubelinter_lint() or semgrep_scan() raises an exception, the temporary file remains on disk instead of being deleted.
  • Impact: This causes a resource leak that accumulates temporary files on disk during production evaluations, especially when tools encounter malformed configurations or crash unexpectedly.

Code with bug

post: List[Violation] | None = None
if patch:
    fixture_path = (answer_obj or {}).get("fixture_path", "")
    fixture_type = (answer_obj or {}).get("fixture_type", "k8s")
    applied, new_text = try_apply_patch(fixture_path, patch)
    if applied:
        with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
            tmp.write(new_text)
            tmp_path = tmp.name
        if fixture_type == "k8s":
            post_findings = kubelinter_lint([tmp_path])  # <-- BUG 🔴 If this raises, tmp_path leaks
        else:
            post_findings = semgrep_scan([tmp_path])  # <-- BUG 🔴 If this raises, tmp_path leaks
        post = normalize_findings(post_findings)
        Path(tmp_path).unlink(missing_ok=True)  # <-- Only reached if no exception above

Location: environments/sv-env-config-verification/sv_env_config_verification.py, lines 215-229 in the reward_config_auditing function.

Evidence

Failing test

Test script

"""Test to verify temporary file leak in reward_config_auditing."""

import json
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch

# Add the environment directory to the path so the project-local module below
# is importable; insert at position 0 so it wins over any installed copy.
env_dir = Path(__file__).parent / "environments" / "sv-env-config-verification"
sys.path.insert(0, str(env_dir))

# Import the function under test (project-local module, not installed from PyPI)
from sv_env_config_verification import reward_config_auditing


def test_temp_file_leak_on_exception():
    """Reproduce the temp-file leak in reward_config_auditing when tool scanning raises.

    Strategy: snapshot the system temp directory, mock kubelinter_lint to
    raise, invoke the reward function, then diff the temp directory to find
    any file left behind by the NamedTemporaryFile(delete=False) code path.

    Returns:
        True if the leak was reproduced (an exception was raised AND at least
        one new temp file remained on disk), False otherwise.
    """

    # Setup: valid completion with a patch so the temp-file branch is exercised
    completion = json.dumps({
        "violations": [{"id": "kube-linter/test", "severity": "med"}],
        "patch": "--- bad_pod.yaml\n+++ bad_pod.yaml\n@@ -1,3 +1,4 @@\n apiVersion: v1\n kind: Pod\n+# test change\n metadata:",
        "confidence": 0.9
    })

    # Setup: answer with fixture_path pointing to a real file so
    # try_apply_patch succeeds and the temporary file actually gets created
    test_fixture_dir = Path(__file__).parent / "environments" / "sv-env-config-verification" / "dataset" / "fixtures" / "k8s"
    fixture_path = test_fixture_dir / "bad_pod.yaml"

    answer = {
        "oracle": [{"id": "kube-linter/test", "severity": "med"}],
        "fixture_path": str(fixture_path),
        "fixture_type": "k8s"
    }

    # Snapshot temp files before the call so we can diff afterwards
    temp_dir = Path(tempfile.gettempdir())
    temp_files_before = set(temp_dir.glob("tmp*"))

    exception_raised = False

    # Mock kubelinter_lint to raise an exception mid-verification
    with patch("sv_env_config_verification.kubelinter_lint") as mock_kubelinter:
        mock_kubelinter.side_effect = RuntimeError("kubelinter crashed!")

        # Catch Exception, not just RuntimeError: the real adapters raise
        # KubeLinterError/SemgrepError, so this keeps the reproduction valid
        # for any tool-failure type instead of crashing the script.
        try:
            result = reward_config_auditing(completion, answer)
            print(f"Function returned: {result}")
        except Exception as e:
            exception_raised = True
            print(f"Exception raised as expected: {e}")

    # Diff the temp directory: anything new is a leak candidate
    temp_files_after = set(temp_dir.glob("tmp*"))
    leaked_files = temp_files_after - temp_files_before

    print(f"Exception raised: {exception_raised}")
    print(f"Temp files before: {len(temp_files_before)}")
    print(f"Temp files after: {len(temp_files_after)}")
    print(f"Leaked files: {list(leaked_files)}")

    # Clean up leaked files manually so repeated runs start from a clean slate
    for leaked_file in leaked_files:
        try:
            # Guard: glob("tmp*") can also match directories created by other
            # processes; only read/unlink regular files.
            if leaked_file.is_file():
                print(f"Content of leaked file {leaked_file}:")
                print(leaked_file.read_text()[:200])
                leaked_file.unlink(missing_ok=True)
                print(f"✓ Cleaned up leaked file: {leaked_file}")
        except Exception as e:
            print(f"Failed to clean up {leaked_file}: {e}")

    if exception_raised and len(leaked_files) > 0:
        print("\n✓ BUG CONFIRMED: Temporary file leaked when tool raises exception")
        return True
    elif not exception_raised:
        print("\n✗ Bug not confirmed: Exception was not raised (caught somewhere?)")
        return False
    else:
        print("\n✗ Bug not confirmed: No files were leaked")
        return False


if __name__ == "__main__":
    # Run the reproduction once and report the verdict between separators.
    banner = "=" * 70
    print("Testing temporary file leak on tool exception...")
    print(banner)
    bug_reproduced = test_temp_file_leak_on_exception()
    print(banner)
    verdict = "Bug confirmed!" if bug_reproduced else "Bug not confirmed - investigating further..."
    print(verdict)

Test output

Testing temporary file leak on tool exception...
======================================================================
Exception raised as expected: kubelinter crashed!
Exception raised: True
Temp files before: 2
Temp files after: 3
Leaked files: [PosixPath('/tmp/tmpeg6bf7zt')]
Content of leaked file /tmp/tmpeg6bf7zt:
apiVersion: v1
kind: Pod
# test change
metadata:
  name: bad-pod
spec:
  containers:
    - name: web
      image: nginx:latest
      securityContext:
        runAsNonRoot: false

✓ Cleaned up leaked file: /tmp/tmpeg6bf7zt

✓ BUG CONFIRMED: Temporary file leaked when tool raises exception
======================================================================
Bug confirmed!

Example

To understand how this bug manifests in practice:

  1. A model generates a patch for a Kubernetes configuration file
  2. The reward_config_auditing function applies this patch to create a modified configuration
  3. A temporary file is created with delete=False to hold the modified configuration
  4. The function calls kubelinter_lint([tmp_path]) to scan the modified file
  5. Bug scenario: KubeLinter crashes due to malformed YAML, invalid configuration syntax, or resource exhaustion
  6. The exception propagates up, bypassing the cleanup line Path(tmp_path).unlink(missing_ok=True)
  7. The temporary file remains on disk permanently

Over thousands of evaluation runs with various models, this accumulates leaked temporary files in /tmp/.

Full context

The reward_config_auditing function is the core reward calculation for the E2 (config-verification) environment. It's called by the Verifiers framework during environment evaluation to compute rewards for model responses. The function:

  1. Parses model output containing predicted violations and an optional patch
  2. Compares predicted violations against a ground-truth oracle
  3. If a patch is provided, applies it to the original fixture file
  4. Creates a temporary file with the patched content
  5. Re-runs security tools (KubeLinter for k8s, Semgrep for Terraform) on the patched file
  6. Calculates a reward based on detection accuracy and patch effectiveness

This function is executed repeatedly during:

  • Offline evaluations via scripts/eval_config_verification.py (potentially thousands of times)
  • Online RL training loops where the environment is stepped repeatedly
  • Unit tests that verify reward calculation

The security tool adapters (kubelinter_lint and semgrep_scan) execute external binaries via subprocess.run(). These can fail for various reasons:

  • Malformed configuration syntax in the patched file
  • Tool binary not found or permission issues
  • Memory exhaustion or timeout
  • Invalid command-line arguments from corrupted state

The adapters in adapters/kubelinter_adapter.py and adapters/semgrep_adapter.py raise KubeLinterError and SemgrepError exceptions on failures. These exceptions are not caught in reward_config_auditing, so they propagate to the caller.

Why has this bug gone undetected?

This bug has gone undetected for several reasons:

  1. Most evaluations succeed: In typical scenarios with well-formed configurations, the security tools run successfully without raising exceptions, so the cleanup code executes normally.

  2. Exceptions are rare in test fixtures: The built-in test fixtures (bad_pod.yaml, bad.tf) are carefully crafted to be parseable by the tools, so tests don't trigger tool failures.

  3. Temporary file cleanup is OS-managed: On most Linux systems, /tmp/ is cleared on reboot or by periodic cleanup jobs (like tmpwatch or systemd-tmpfiles), so leaked files don't accumulate indefinitely in development environments.

  4. Exception propagation is desired: When tools fail, the exception propagating to the caller is actually correct behavior - the reward cannot be calculated without tool output. The tests verify this with mocked tool failures, but they don't check for resource leaks.

  5. Limited production deployment: Since E2 is currently in development (not production-ready according to CLAUDE.md), large-scale evaluations that would expose this leak haven't been run yet.

  6. No monitoring for temp files: Neither the evaluation scripts nor the environment itself monitor temporary file creation/cleanup, so the leak is invisible unless explicitly checked.

Recommended fix

Wrap the tool scanning and normalization in a try-finally block to ensure cleanup:

post: List[Violation] | None = None
if patch:
    fixture_path = (answer_obj or {}).get("fixture_path", "")
    fixture_type = (answer_obj or {}).get("fixture_type", "k8s")
    applied, new_text = try_apply_patch(fixture_path, patch)
    if applied:
        with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
            tmp.write(new_text)
            tmp_path = tmp.name
        try:  # <-- FIX 🟢 Ensure cleanup even on exception
            if fixture_type == "k8s":
                post_findings = kubelinter_lint([tmp_path])
            else:
                post_findings = semgrep_scan([tmp_path])
            post = normalize_findings(post_findings)
        finally:  # <-- FIX 🟢 Always cleanup
            Path(tmp_path).unlink(missing_ok=True)

return final_reward(violations_pred, oracle, post_patch=post)

Related bugs

The same bug exists in environments/sv-env-config-verification/env.py in the ConfigAuditingEnv.step() method (lines 37-52):

def step(self, model_json: Dict[str, Any]) -> Dict[str, Any]:
    """Step the environment."""
    violations_pred, patch, _confidence = parse_model_output(model_json)
    post: List[Violation] | None = None
    if patch:
        applied, new_text = try_apply_patch(self.fixture_path, patch)
        if applied:
            with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
                tmp.write(new_text)
                tmp_path = tmp.name
            if self.fixture_type == "k8s":
                post_findings = kubelinter_lint([tmp_path])  # <-- Same bug
            else:
                post_findings = semgrep_scan([tmp_path])  # <-- Same bug
            post = normalize_findings(post_findings)
            Path(tmp_path).unlink(missing_ok=True)  # <-- Never reached on exception

Both occurrences should be fixed with the same try-finally pattern.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug · "Something isn't working" · detail

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions