Skip to content

Commit 335b313

Browse files
authored
Improve Windows process termination and enhance service lifecycle management (#52)
1 parent 680342a commit 335b313

File tree

3 files changed

+115
-61
lines changed

3 files changed

+115
-61
lines changed

cli/docs/dev/process-isolation.md

Lines changed: 61 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,136 +1,137 @@
1-
# process isolation implementation
1+
git fetch --all
2+
git rebase # Process Isolation Implementation
23

3-
## overview
4+
## Overview
45

5-
fixed two critical issues with `azd app run`:
6-
1. **windows process shutdown errors**: "graceful shutdown signal failed, forcing kill" error="invalid argument" / "not supported by windows"
7-
2. **service crash propagation**: individual service crashes were terminating the entire `azd app run` process and all other services
6+
Fixed two critical issues with `azd app run`:
7+
1. **Windows process shutdown errors**: "graceful shutdown signal failed, forcing kill" error="invalid argument" / "not supported by windows"
8+
2. **Service crash propagation**: individual service crashes were terminating the entire `azd app run` process and all other services
89

9-
## changes made
10+
## Changes Made
1011

11-
### 1. windows process termination fix
12+
### 1. Windows Process Termination Fix
1213
**file**: `cli/src/internal/service/executor.go`
1314

14-
**problem**: windows doesn't support `os.interrupt` signal properly, causing "not supported by windows" errors
15+
**problem**: Windows doesn't support `os.Interrupt` signal properly, causing "not supported by windows" errors
1516

1617
**solution**: platform-specific process termination
17-
- unix/linux/macos: use graceful `sigint` shutdown with timeout
18-
- windows: use direct `kill()` instead of signals
18+
- Unix/Linux/macOS: use graceful `SIGINT` shutdown with timeout
19+
- Windows: use direct `Kill()` instead of signals
1920

2021
```go
21-
// windows doesn't support os.interrupt properly, so kill directly
22-
if runtime.goos == "windows" {
23-
return process.process.kill()
22+
// Windows doesn't support os.Interrupt properly, so kill directly
23+
if runtime.GOOS == "windows" {
24+
return process.Process.Kill()
2425
}
25-
// unix/linux/macos can gracefully handle interrupt signal
26-
if err := process.process.signal(os.interrupt); err != nil {
27-
return fmt.errorf("failed to send interrupt signal: %w", err)
26+
// Unix/Linux/macOS can gracefully handle interrupt signal
27+
if err := process.Process.Signal(os.Interrupt); err != nil {
28+
return fmt.Errorf("failed to send interrupt signal: %w", err)
2829
}
2930
```
3031

3132
**test coverage**:
32-
- `testservicegraceful_windows()` - validates no "not supported" errors on windows
33-
- `testservicegraceful_success()` - validates graceful shutdown with timeout (all platforms)
34-
- `testservicegraceful_forcedkillaftertimeout()` - validates timeout behavior
33+
- `TestStopServiceGraceful_Windows()` - validates no "not supported" errors on Windows
34+
- `TestStopServiceGraceful_Success()` - validates graceful shutdown with timeout (all platforms)
35+
- `TestStopServiceGraceful_ForcedKillAfterTimeout()` - validates timeout behavior
3536

36-
### 2. process isolation implementation
37+
### 2. Process Isolation Implementation
3738
**file**: `cli/src/cmd/app/commands/run.go`
3839

39-
**problem**: `errgroup.group` causes fail-fast behavior - first service error cancels all services
40+
**problem**: `errgroup.Group` causes fail-fast behavior - first service error cancels all services
4041

41-
**solution**: replaced `errgroup` with `sync.waitgroup` for true process isolation
42+
**solution**: replaced `errgroup` with `sync.WaitGroup` for true process isolation
4243

4344
**architecture**:
4445
```
45-
monitorservicesuntilshutdown()
46-
├── sync.waitgroup - coordinates independent goroutines
47-
├── context with signal.notifycontext - handles ctrl+c
46+
monitorServicesUntilShutdown()
47+
├── sync.WaitGroup - coordinates independent goroutines
48+
├── context with signal.NotifyContext - handles ctrl+c
4849
└── for each service:
4950
└── goroutine with:
5051
├── defer/recover - catches panics
51-
├── monitorserviceprocess() - monitors one service
52+
├── monitorServiceProcess() - monitors one service
5253
└── no error propagation - crashes logged but don't affect others
5354
```
5455

5556
**key functions**:
5657

57-
1. **monitorservicesuntilshutdown()** - orchestrates all service monitoring
58-
- uses `sync.waitgroup` instead of `errgroup`
59-
- creates context with signal.notifycontext for ctrl+c handling
58+
1. **monitorServicesUntilShutdown()** - orchestrates all service monitoring
59+
- uses `sync.WaitGroup` instead of `errgroup`
60+
- creates context with signal.NotifyContext for ctrl+c handling
6061
- spawns independent goroutine per service
6162
- waits for shutdown signal, not for processes to exit
6263

63-
2. **monitorserviceprocess()** - monitors individual service (extracted helper)
64+
2. **monitorServiceProcess()** - monitors individual service (extracted helper)
6465
- handles service output streaming
6566
- logs service exit (clean or crash)
6667
- doesn't propagate errors or cancel context
6768
- enables unit testing of monitoring logic
6869

69-
3. **shutdownallservices()** - gracefully stops all services
70+
3. **shutdownAllServices()** - gracefully stops all services
7071
- enhanced documentation
7172
- concurrent shutdown using goroutines
7273
- waits for all to complete
7374
- aggregates errors but doesn't fail-fast
7475

7576
**panic recovery**:
7677
```go
77-
go func(name string, process *service.serviceprocess) {
78-
defer wg.done()
78+
go func(name string, process *service.ServiceProcess) {
79+
defer wg.Done()
7980
defer func() {
8081
if r := recover(); r != nil {
81-
slog.error("panic in service monitor", "service", name, "panic", r)
82+
slog.Error("panic in service monitor", "service", name, "panic", r)
8283
}
8384
}()
84-
monitorserviceprocess(ctx, name, process)
85+
monitorServiceProcess(ctx, name, process)
8586
}(name, process)
8687
```
8788

88-
### 3. comprehensive test coverage
89+
### 3. Comprehensive Test Coverage
8990
**file**: `cli/src/cmd/app/commands/run_test.go`
9091

9192
**new tests**:
92-
1. `testmonitorserviceprocess_cleanexit()` - validates clean service exit handling
93-
2. `testmonitorserviceprocess_crashexit()` - validates crash doesn't propagate
94-
3. `testmonitorserviceprocess_contextcancellation()` - validates ctrl+c handling
93+
1. `TestMonitorServiceProcess_CleanExit()` - validates clean service exit handling
94+
2. `TestMonitorServiceProcess_CrashExit()` - validates crash doesn't propagate
95+
3. `TestMonitorServiceProcess_ContextCancellation()` - validates ctrl+c handling
9596

9697
**updated tests**:
97-
- `testprocessexit_doesnotstopotherservices()` - validates full isolation
98-
- `testmonitorservicesuntilshutdown_startuptimeout()` - updated for new behavior
99-
- `testmonitorservicesuntilshutdown_multipleservices()` - fixed cleanup timing
98+
- `TestProcessExit_DoesNotStopOtherServices()` - validates full isolation
99+
- `TestMonitorServicesUntilShutdown_StartupTimeout()` - updated for new behavior
100+
- `TestMonitorServicesUntilShutdown_MultipleServices()` - fixed cleanup timing
100101

101102
**coverage**: 41.3% of commands package statements
102103

103-
### 4. flaky test fixes
104+
### 4. Flaky Test Fixes
104105
**file**: `cli/src/internal/service/graceful_shutdown_test.go`
105106

106-
**problem**: `teststopservicegraceful_success` occasionally failed with "terminateprocess: access is denied"
107+
**problem**: `TestStopServiceGraceful_Success` occasionally failed with "TerminateProcess: Access is denied"
107108

108-
**solution**: accept "access is denied" as valid (process already exited)
109+
**solution**: accept "Access is denied" as valid (process already exited)
109110
```go
110-
if err != nil && !strings.contains(err.error(), "access is denied") {
111-
t.errorf("stopservicegraceful() error = %v, want nil or 'access is denied'", err)
111+
if err != nil && !strings.Contains(err.Error(), "Access is denied") {
112+
t.Errorf("StopServiceGraceful() error = %v, want nil or 'Access is denied'", err)
112113
}
113114
```
114115

115116
**validation**: ran test 5 times consecutively - all passed
116117

117-
## behavioral changes
118+
## Behavioral Changes
118119

119-
### before
120+
### Before
120121
- any service crash → entire `azd app run` terminates
121122
- all services stop when one service exits
122123
- dashboard stops when any service fails
123124
- windows processes fail to shutdown cleanly
124125
- error: "graceful shutdown signal failed, forcing kill"
125126

126-
### after
127+
### After
127128
- service crashes are isolated → other services continue running
128129
- monitoring waits for ctrl+c, not for process exit
129130
- dashboard continues running even if all services crash
130131
- windows processes shutdown cleanly without signal errors
131132
- user must press ctrl+c to stop all services
132133

133-
### user experience
134+
### User Experience
134135
**before**:
135136
```
136137
starting service api...
@@ -153,11 +154,11 @@ starting service web...
153154
<user can fix api and restart it independently>
154155
```
155156

156-
## implementation patterns
157+
## Implementation Patterns
157158

158-
### idiomatic go
159+
### Idiomatic Go
159160
this implementation follows standard go patterns used in:
160-
- `net/http.server` - uses sync.waitgroup for connection handling
161+
- `net/http.Server` - uses sync.WaitGroup for connection handling
161162
- `database/sql` - connection pools with independent goroutines
162163
- `sync` package - canonical pattern for independent concurrent tasks
163164

@@ -167,34 +168,34 @@ this implementation follows standard go patterns used in:
167168
- parallel computations where all must succeed
168169
- examples: batch processing, multi-source data fetching
169170

170-
**sync.waitgroup** is designed for:
171+
**sync.WaitGroup** is designed for:
171172
- independent concurrent tasks
172173
- service orchestration where isolation is needed
173174
- examples: http servers, database connection pools, our use case
174175

175-
## verification
176+
## Verification
176177

177-
### test results
178+
### Test Results
178179
```
179180
commands package: pass (42.793s, 41.3% coverage)
180181
service package: pass (16.821s)
181182
build: success
182183
```
183184

184-
### manual validation
185+
### Manual Validation
185186
1. windows shutdown: no more "invalid argument" errors
186187
2. process isolation: one service crash doesn't affect others
187188
3. ctrl+c handling: clean shutdown of all services
188189
4. dashboard persistence: continues running through service crashes
189190

190-
## files modified
191+
## Files Modified
191192
1. `cli/src/internal/service/executor.go` - windows process termination
192193
2. `cli/src/internal/service/executor_test.go` - windows termination test
193194
3. `cli/src/cmd/app/commands/run.go` - process isolation architecture
194195
4. `cli/src/cmd/app/commands/run_test.go` - comprehensive test coverage
195196
5. `cli/src/internal/service/graceful_shutdown_test.go` - flaky test fix
196197

197-
## next steps (optional)
198+
## Next Steps (Optional)
198199
1. integration tests for multi-service scenarios
199200
2. performance benchmarks for large service counts
200201
3. service restart logic (currently requires manual restart)

cli/src/cmd/app/commands/run_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,24 @@ func TestMonitorServicesUntilShutdown_StartupTimeout(t *testing.T) {
267267
done <- monitorServicesUntilShutdown(result, tmpDir)
268268
}()
269269

270+
// Ensure cleanup if test exits early
271+
defer func() {
272+
// If monitoring is still running, send interrupt to clean up
273+
select {
274+
case <-done:
275+
// Already completed
276+
default:
277+
// Send interrupt to terminate monitoring goroutine
278+
proc, _ := os.FindProcess(os.Getpid())
279+
_ = proc.Signal(os.Interrupt)
280+
// Wait briefly for cleanup
281+
select {
282+
case <-done:
283+
case <-time.After(500 * time.Millisecond):
284+
}
285+
}
286+
}()
287+
270288
// Monitoring should continue even after service exits (process isolation)
271289
// Force completion after 2 seconds by stopping the process
272290
select {
@@ -828,6 +846,24 @@ func TestProcessExit_DoesNotStopOtherServices(t *testing.T) {
828846
done <- monitorServicesUntilShutdown(result, tmpDir)
829847
}()
830848

849+
// Ensure cleanup if test exits
850+
defer func() {
851+
// If monitoring is still running, send interrupt to clean up
852+
select {
853+
case <-done:
854+
// Already completed
855+
default:
856+
// Send interrupt to terminate monitoring goroutine cleanly
857+
proc, _ := os.FindProcess(os.Getpid())
858+
_ = proc.Signal(os.Interrupt)
859+
// Wait briefly for cleanup to complete
860+
select {
861+
case <-done:
862+
case <-time.After(500 * time.Millisecond):
863+
}
864+
}
865+
}()
866+
831867
// Wait to verify monitoring continues after quick-exit stops (~1s)
832868
// If process isolation works, monitoring should still be running after 3 seconds
833869
time.Sleep(3 * time.Second)

cli/src/internal/service/executor.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,13 @@ func StopServiceGraceful(process *ServiceProcess, timeout time.Duration) error {
125125
}
126126
// Wait for process to exit
127127
_, waitErr := process.Process.Wait()
128+
// Ignore "Access is denied" - process already exited on Windows
129+
if waitErr != nil && !isAccessDeniedError(waitErr) {
130+
return waitErr
131+
}
128132
slog.Info("service stopped",
129133
slog.String("service", process.Name))
130-
return waitErr
134+
return nil
131135
}
132136

133137
// On Unix/Linux/macOS, try graceful shutdown with SIGINT first
@@ -256,3 +260,16 @@ func collectStreamLogs(reader io.ReadCloser, serviceName string, buffer *LogBuff
256260
buffer.Add(entry)
257261
}
258262
}
263+
264+
// isAccessDeniedError checks if an error is a Windows "Access is denied" error.
265+
// This error occurs when Wait() is called after Kill() on Windows because the
266+
// process has already exited and its handle is no longer valid.
267+
func isAccessDeniedError(err error) bool {
268+
if err == nil {
269+
return false
270+
}
271+
// Check for Windows-specific "Access is denied" error message
272+
return runtime.GOOS == "windows" &&
273+
(err.Error() == "Access is denied" ||
274+
err.Error() == "TerminateProcess: Access is denied.")
275+
}

0 commit comments

Comments
 (0)