From 4942e2a86779f3464291ec5cb85db53cd885ad10 Mon Sep 17 00:00:00 2001 From: Jaime Pillora Date: Fri, 1 May 2026 08:10:33 +1000 Subject: [PATCH] Fix race in tunnel waitGroup causing negative counter panic Add() bumped its atomic counter before calling inner.Add(), so a concurrent DoneAll() could observe n>0 and call inner.Done() before inner.Add() ran, panicking with "sync: negative WaitGroup counter". Serialize Add/Done/DoneAll under a mutex; Wait stays outside to avoid deadlock. Fixes #585 Co-Authored-By: Claude Opus 4.7 (1M context) --- share/tunnel/wg.go | 24 ++++++++++------ share/tunnel/wg_test.go | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 share/tunnel/wg_test.go diff --git a/share/tunnel/wg.go b/share/tunnel/wg.go index e915ad0a..e3a3d8eb 100644 --- a/share/tunnel/wg.go +++ b/share/tunnel/wg.go @@ -1,30 +1,36 @@ package tunnel -import ( - "sync" - "sync/atomic" -) +import "sync" type waitGroup struct { + mu sync.Mutex inner sync.WaitGroup - n int32 + n int } func (w *waitGroup) Add(n int) { - atomic.AddInt32(&w.n, int32(n)) + w.mu.Lock() + w.n += n w.inner.Add(n) + w.mu.Unlock() } func (w *waitGroup) Done() { - if n := atomic.LoadInt32(&w.n); n > 0 && atomic.CompareAndSwapInt32(&w.n, n, n-1) { + w.mu.Lock() + if w.n > 0 { + w.n-- w.inner.Done() } + w.mu.Unlock() } func (w *waitGroup) DoneAll() { - for atomic.LoadInt32(&w.n) > 0 { - w.Done() + w.mu.Lock() + for w.n > 0 { + w.n-- + w.inner.Done() } + w.mu.Unlock() } func (w *waitGroup) Wait() { diff --git a/share/tunnel/wg_test.go b/share/tunnel/wg_test.go new file mode 100644 index 00000000..01e0b844 --- /dev/null +++ b/share/tunnel/wg_test.go @@ -0,0 +1,63 @@ +package tunnel + +import ( + "sync" + "testing" + "time" +) + +// TestWaitGroupAddDoneAllRace stresses concurrent Add/DoneAll to reproduce the +// race fixed in #585. Without the mutex, Add() is non-atomic between bumping +// the counter and the inner sync.WaitGroup, so DoneAll()'s loop can call +// inner.Done() before inner.Add() runs and panic with "negative WaitGroup +// counter". +func TestWaitGroupAddDoneAllRace(t *testing.T) { + var wg waitGroup + stop := make(chan struct{}) + var workers sync.WaitGroup + for i := 0; i < 8; i++ { + workers.Add(2) + go func() { + defer workers.Done() + for { + select { + case <-stop: + return + default: + wg.Add(1) + } + } + }() + go func() { + defer workers.Done() + for { + select { + case <-stop: + return + default: + wg.DoneAll() + } + } + }() + } + time.Sleep(500 * time.Millisecond) + close(stop) + workers.Wait() + wg.DoneAll() + wg.Wait() +} + +func TestWaitGroupDoneAllDrains(t *testing.T) { + var wg waitGroup + wg.Add(3) + wg.DoneAll() + wg.Wait() +} + +func TestWaitGroupDoneIsIdempotent(t *testing.T) { + var wg waitGroup + wg.Add(1) + wg.Done() + wg.Done() // extra Done must be a no-op, not a panic + wg.Wait() +}