From 5c08b050f01ed56f9a7eb0924f5824c717f97917 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Fri, 7 Nov 2025 20:07:31 +0700 Subject: [PATCH 01/19] feat: worker interface --- internal/worker/health.go | 102 ++++++ internal/worker/logger_test.go | 23 ++ internal/worker/registry.go | 164 +++++++++ internal/worker/worker.go | 20 + internal/worker/worker_test.go | 646 +++++++++++++++++++++++++++++++++ 5 files changed, 955 insertions(+) create mode 100644 internal/worker/health.go create mode 100644 internal/worker/logger_test.go create mode 100644 internal/worker/registry.go create mode 100644 internal/worker/worker.go create mode 100644 internal/worker/worker_test.go diff --git a/internal/worker/health.go b/internal/worker/health.go new file mode 100644 index 00000000..9ce71f53 --- /dev/null +++ b/internal/worker/health.go @@ -0,0 +1,102 @@ +package worker + +import ( + "sync" + "time" +) + +const ( + WorkerStatusHealthy = "healthy" + WorkerStatusFailed = "failed" +) + +// WorkerHealth represents the health status of a single worker. +// Error details are NOT exposed for security reasons. +type WorkerHealth struct { + Name string `json:"name"` + Status string `json:"status"` // "healthy" or "failed" + LastCheck time.Time `json:"last_check"` +} + +// HealthTracker tracks the health status of all workers. +// It is safe for concurrent use. +type HealthTracker struct { + mu sync.RWMutex + workers map[string]WorkerHealth +} + +// NewHealthTracker creates a new HealthTracker. +func NewHealthTracker() *HealthTracker { + return &HealthTracker{ + workers: make(map[string]WorkerHealth), + } +} + +// MarkHealthy marks a worker as healthy. +func (h *HealthTracker) MarkHealthy(name string) { + h.mu.Lock() + defer h.mu.Unlock() + + h.workers[name] = WorkerHealth{ + Name: name, + Status: WorkerStatusHealthy, + LastCheck: time.Now(), + } +} + +// MarkFailed marks a worker as failed. +// Note: Error details are NOT stored for security reasons. +func (h *HealthTracker) MarkFailed(name string) { + h.mu.Lock() + defer h.mu.Unlock() + + h.workers[name] = WorkerHealth{ + Name: name, + Status: WorkerStatusFailed, + LastCheck: time.Now(), + } +} + +// IsHealthy returns true if all workers are healthy. +func (h *HealthTracker) IsHealthy() bool { + h.mu.RLock() + defer h.mu.RUnlock() + + for _, w := range h.workers { + if w.Status != WorkerStatusHealthy { + return false + } + } + return true +} + +// GetStatus returns the overall health status with details of all workers. 
+func (h *HealthTracker) GetStatus() map[string]interface{} { + h.mu.RLock() + defer h.mu.RUnlock() + + workers := make(map[string]WorkerHealth) + for name, w := range h.workers { + workers[name] = w + } + + status := "healthy" + if !h.isHealthyLocked() { + status = "failed" + } + + return map[string]interface{}{ + "status": status, + "workers": workers, + } +} + +// isHealthyLocked checks health without acquiring lock (caller must hold read lock) +func (h *HealthTracker) isHealthyLocked() bool { + for _, w := range h.workers { + if w.Status != WorkerStatusHealthy { + return false + } + } + return true +} diff --git a/internal/worker/logger_test.go b/internal/worker/logger_test.go new file mode 100644 index 00000000..55c761ad --- /dev/null +++ b/internal/worker/logger_test.go @@ -0,0 +1,23 @@ +package worker_test + +import ( + "testing" + + "github.com/hookdeck/outpost/internal/util/testutil" + "github.com/hookdeck/outpost/internal/worker" +) + +// TestLoggingLoggerImplementsInterface verifies that *logging.Logger +// from internal/logging satisfies the worker.Logger interface. +func TestLoggingLoggerImplementsInterface(t *testing.T) { + logger := testutil.CreateTestLogger(t) + + // This will fail to compile if *logging.Logger doesn't implement worker.Logger + var _ worker.Logger = logger + + // Also verify we can actually use it with WorkerRegistry + registry := worker.NewWorkerRegistry(logger) + if registry == nil { + t.Fatal("expected non-nil registry") + } +} diff --git a/internal/worker/registry.go b/internal/worker/registry.go new file mode 100644 index 00000000..511d95bd --- /dev/null +++ b/internal/worker/registry.go @@ -0,0 +1,164 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + "go.uber.org/zap" +) + +// Logger is a minimal logging interface for structured logging with zap. +type Logger interface { + Info(msg string, fields ...zap.Field) + Error(msg string, fields ...zap.Field) + Debug(msg string, fields ...zap.Field) + Warn(msg string, fields ...zap.Field) +} + +// WorkerRegistry manages and supervises multiple workers. +// It tracks their health and handles graceful shutdown. +type WorkerRegistry struct { + workers map[string]Worker + health *HealthTracker + logger Logger + shutdownTimeout time.Duration // 0 means no timeout +} + +// RegistryOption configures a WorkerRegistry. +type RegistryOption func(*WorkerRegistry) + +// WithShutdownTimeout sets the maximum time to wait for workers to shutdown gracefully. +// After this timeout, Run() will return even if workers haven't finished. +// Default is 0 (no timeout - wait indefinitely). +func WithShutdownTimeout(timeout time.Duration) RegistryOption { + return func(r *WorkerRegistry) { + r.shutdownTimeout = timeout + } +} + +// NewWorkerRegistry creates a new WorkerRegistry. +func NewWorkerRegistry(logger Logger, opts ...RegistryOption) *WorkerRegistry { + r := &WorkerRegistry{ + workers: make(map[string]Worker), + health: NewHealthTracker(), + logger: logger, + shutdownTimeout: 0, // Default: no timeout + } + + for _, opt := range opts { + opt(r) + } + + return r +} + +// Register adds a worker to the registry. +// Panics if a worker with the same name is already registered. 
+func (r *WorkerRegistry) Register(w Worker) { + if _, exists := r.workers[w.Name()]; exists { + panic(fmt.Sprintf("worker %s already registered", w.Name())) + } + r.workers[w.Name()] = w + r.logger.Debug("worker registered", zap.String("worker", w.Name())) +} + +// GetHealthTracker returns the health tracker for this registry. +func (r *WorkerRegistry) GetHealthTracker() *HealthTracker { + return r.health +} + +// Run starts all registered workers and supervises them. +// It blocks until: +// - ALL workers have exited (either successfully or with errors), OR +// - The context is cancelled (SIGTERM/SIGINT) +// +// When a worker fails, it marks the worker as failed but does NOT +// terminate other workers. This allows: +// - Other workers to continue serving (e.g., HTTP server stays up) +// - Health checks to report the failed worker status +// - Orchestrator to detect failure and restart the pod/container +// +// Returns nil if context was cancelled and workers shutdown gracefully. +// Returns error if workers failed to shutdown within timeout (if configured). +func (r *WorkerRegistry) Run(ctx context.Context) error { + if len(r.workers) == 0 { + r.logger.Warn("no workers registered") + return nil + } + + r.logger.Info("starting workers", zap.Int("count", len(r.workers))) + + // WaitGroup to track worker goroutines + var wg sync.WaitGroup + + // Start all workers + for name, worker := range r.workers { + wg.Add(1) + go func(name string, w Worker) { + defer wg.Done() + + r.logger.Info("worker starting", zap.String("worker", name)) + + // Run the worker + if err := w.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { + r.logger.Error("worker failed", + zap.String("worker", name), + zap.Error(err)) + r.health.MarkFailed(name) + } else { + r.logger.Info("worker stopped gracefully", zap.String("worker", name)) + r.health.MarkHealthy(name) + } + }(name, worker) + } + + // Wait for either: + // - All workers to exit (wg.Wait completes) + // - Context cancellation (graceful shutdown requested) + select { + case <-ctx.Done(): + r.logger.Info("context cancelled, shutting down workers") + + // Wait for all workers to finish gracefully, with optional timeout + if r.shutdownTimeout > 0 { + return r.waitWithTimeout(&wg, r.shutdownTimeout) + } + + // No timeout - wait indefinitely + wg.Wait() + return nil + case <-r.waitForWorkers(&wg): + // All workers exited (either successfully or with errors) + r.logger.Warn("all workers have exited") + return nil + } +} + +// waitForWorkers converts WaitGroup.Wait() into a channel that can be used in select. +// Returns a channel that closes when all workers have exited. +func (r *WorkerRegistry) waitForWorkers(wg *sync.WaitGroup) <-chan struct{} { + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + return done +} + +// waitWithTimeout waits for the WaitGroup with a timeout. +// Returns nil if all workers finish within timeout. +// Returns error if timeout is exceeded. 
+func (r *WorkerRegistry) waitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { + select { + case <-r.waitForWorkers(wg): + r.logger.Info("all workers shutdown gracefully") + return nil + case <-time.After(timeout): + r.logger.Warn("shutdown timeout exceeded, some workers may still be running", + zap.Duration("timeout", timeout)) + return fmt.Errorf("shutdown timeout exceeded (%v)", timeout) + } +} diff --git a/internal/worker/worker.go b/internal/worker/worker.go new file mode 100644 index 00000000..538b4a4e --- /dev/null +++ b/internal/worker/worker.go @@ -0,0 +1,20 @@ +package worker + +import "context" + +// Worker represents a long-running background process. +// Each worker runs in its own goroutine and can be monitored for health. +// +// Workers should: +// - Block in Run() until context is cancelled or a fatal error occurs +// - Return nil or context.Canceled for graceful shutdown +// - Return non-nil error only for fatal failures +type Worker interface { + // Name returns a unique identifier for this worker (e.g., "http-server", "retry-scheduler") + Name() string + + // Run executes the worker and blocks until context is cancelled or error occurs. + // Returns nil or context.Canceled for graceful shutdown. + // Returns error for fatal failures. + Run(ctx context.Context) error +} diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go new file mode 100644 index 00000000..a1c783de --- /dev/null +++ b/internal/worker/worker_test.go @@ -0,0 +1,646 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" +) + +// Mock worker for testing +type mockWorker struct { + name string + runFunc func(ctx context.Context) error + mu sync.Mutex + started bool +} + +func newMockWorker(name string, runFunc func(ctx context.Context) error) *mockWorker { + return &mockWorker{ + name: name, + runFunc: runFunc, + } +} + +func (m *mockWorker) Name() string { + return m.name +} + +func (m *mockWorker) Run(ctx context.Context) error { + m.mu.Lock() + m.started = true + m.mu.Unlock() + + if m.runFunc != nil { + return m.runFunc(ctx) + } + <-ctx.Done() + return nil +} + +func (m *mockWorker) WasStarted() bool { + m.mu.Lock() + defer m.mu.Unlock() + return m.started +} + +// Mock logger for testing +type mockLogger struct { + mu sync.Mutex + messages []string +} + +func newMockLogger() *mockLogger { + return &mockLogger{ + messages: []string{}, + } +} + +func (l *mockLogger) log(level, msg string, fields ...zap.Field) { + l.mu.Lock() + defer l.mu.Unlock() + l.messages = append(l.messages, fmt.Sprintf("[%s] %s", level, msg)) +} + +func (l *mockLogger) Info(msg string, fields ...zap.Field) { + l.log("INFO", msg, fields...) +} + +func (l *mockLogger) Error(msg string, fields ...zap.Field) { + l.log("ERROR", msg, fields...) +} + +func (l *mockLogger) Debug(msg string, fields ...zap.Field) { + l.log("DEBUG", msg, fields...) +} + +func (l *mockLogger) Warn(msg string, fields ...zap.Field) { + l.log("WARN", msg, fields...) 
+} + +func (l *mockLogger) Contains(substr string) bool { + l.mu.Lock() + defer l.mu.Unlock() + for _, msg := range l.messages { + if contains(msg, substr) { + return true + } + } + return false +} + +func contains(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && (s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || s[1:len(s)-1] != s[1:len(s)-1] && contains(s[1:], substr))) +} + +// Tests + +func TestHealthTracker_MarkHealthy(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + tracker.MarkHealthy("worker-1") + + status := tracker.GetStatus() + assert.Equal(t, "healthy", status["status"]) + + workers := status["workers"].(map[string]WorkerHealth) + assert.Len(t, workers, 1) + assert.Equal(t, WorkerStatusHealthy, workers["worker-1"].Status) +} + +func TestHealthTracker_MarkFailed(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + tracker.MarkFailed("worker-1") + + status := tracker.GetStatus() + assert.Equal(t, "failed", status["status"]) + + workers := status["workers"].(map[string]WorkerHealth) + assert.Len(t, workers, 1) + assert.Equal(t, WorkerStatusFailed, workers["worker-1"].Status) +} + +func TestHealthTracker_IsHealthy_AllHealthy(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + tracker.MarkHealthy("worker-1") + tracker.MarkHealthy("worker-2") + + assert.True(t, tracker.IsHealthy()) +} + +func TestHealthTracker_IsHealthy_OneFailed(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + tracker.MarkHealthy("worker-1") + tracker.MarkFailed("worker-2") + + assert.False(t, tracker.IsHealthy()) +} + +func TestHealthTracker_NoErrorExposed(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + tracker.MarkFailed("worker-1") + + status := tracker.GetStatus() + workers := status["workers"].(map[string]WorkerHealth) + + // Verify that error details are NOT exposed + health := workers["worker-1"] + assert.Equal(t, WorkerStatusFailed, health.Status) + // Verify WorkerHealth struct has no Error field (compile-time check via struct) + // If Error field existed, this would have compile error + _ = WorkerHealth{ + Name: "test", + Status: "healthy", + LastCheck: time.Now(), + } +} + +func TestWorkerRegistry_RegisterWorker(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + worker := newMockWorker("test-worker", nil) + registry.Register(worker) + + assert.Len(t, registry.workers, 1) + assert.True(t, logger.Contains("worker registered")) +} + +func TestWorkerRegistry_RegisterDuplicateWorker(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + worker1 := newMockWorker("test-worker", nil) + worker2 := newMockWorker("test-worker", nil) + + registry.Register(worker1) + + assert.Panics(t, func() { + registry.Register(worker2) + }) +} + +func TestWorkerRegistry_Run_HealthyWorkers(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + worker1 := newMockWorker("worker-1", func(ctx context.Context) error { + <-ctx.Done() + return nil + }) + worker2 := newMockWorker("worker-2", func(ctx context.Context) error { + <-ctx.Done() + return nil + }) + + registry.Register(worker1) + registry.Register(worker2) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + go func() { + errChan <- registry.Run(ctx) + }() + + // Give workers time to start + time.Sleep(50 * time.Millisecond) + + // Verify workers started + 
assert.True(t, worker1.WasStarted()) + assert.True(t, worker2.WasStarted()) + + // Verify health + assert.True(t, registry.GetHealthTracker().IsHealthy()) + + // Cancel context and verify graceful shutdown + cancel() + + err := <-errChan + assert.NoError(t, err) +} + +func TestWorkerRegistry_Run_FailedWorker(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + healthyWorker := newMockWorker("healthy", func(ctx context.Context) error { + <-ctx.Done() + return nil + }) + + failingWorker := newMockWorker("failing", func(ctx context.Context) error { + time.Sleep(50 * time.Millisecond) + return errors.New("worker failed") + }) + + registry.Register(healthyWorker) + registry.Register(failingWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + go func() { + errChan <- registry.Run(ctx) + }() + + // Wait for failing worker to fail + time.Sleep(100 * time.Millisecond) + + // Verify health reflects failure while registry is still running + assert.False(t, registry.GetHealthTracker().IsHealthy()) + + status := registry.GetHealthTracker().GetStatus() + assert.Equal(t, "failed", status["status"]) + + workers := status["workers"].(map[string]WorkerHealth) + assert.Equal(t, WorkerStatusFailed, workers["failing"].Status) + + // Verify that registry is still blocking (hasn't returned yet) + select { + case <-errChan: + t.Fatal("registry.Run() returned early - should keep running until context cancelled") + default: + // Good - still blocking + } + + // Now cancel context and verify graceful shutdown + cancel() + err := <-errChan + assert.NoError(t, err) // Graceful shutdown should return nil +} + +func TestWorkerRegistry_Run_AllWorkersExit(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + // Both workers exit on their own (not from context cancellation) + worker1 := newMockWorker("worker-1", func(ctx context.Context) error { + time.Sleep(50 * time.Millisecond) + return errors.New("worker 1 failed") + }) + + worker2 := newMockWorker("worker-2", func(ctx context.Context) error { + time.Sleep(100 * time.Millisecond) + return errors.New("worker 2 failed") + }) + + registry.Register(worker1) + registry.Register(worker2) + + ctx := context.Background() + + errChan := make(chan error, 1) + go func() { + errChan <- registry.Run(ctx) + }() + + // Wait for both workers to exit + err := <-errChan + assert.NoError(t, err) // Should return nil when all workers exit + + // Verify both workers are marked as failed + assert.False(t, registry.GetHealthTracker().IsHealthy()) + + status := registry.GetHealthTracker().GetStatus() + assert.Equal(t, "failed", status["status"]) + + workers := status["workers"].(map[string]WorkerHealth) + assert.Equal(t, WorkerStatusFailed, workers["worker-1"].Status) + assert.Equal(t, WorkerStatusFailed, workers["worker-2"].Status) + + // Verify log message + assert.True(t, logger.Contains("all workers have exited")) +} + +func TestWorkerRegistry_Run_ContextCanceled(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + worker := newMockWorker("worker-1", func(ctx context.Context) error { + <-ctx.Done() + return context.Canceled + }) + + registry.Register(worker) + + ctx, cancel := context.WithCancel(context.Background()) + + errChan := make(chan error, 1) + go func() { + errChan <- registry.Run(ctx) + }() + + // Give worker time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context + cancel() + + err := 
<-errChan + assert.NoError(t, err) // context.Canceled is not treated as error +} + +func TestWorkerRegistry_Run_NoWorkers(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + ctx := context.Background() + err := registry.Run(ctx) + + assert.NoError(t, err) + assert.True(t, logger.Contains("no workers registered")) +} + +func TestHealthTracker_ConcurrentAccess(t *testing.T) { + t.Parallel() + + tracker := NewHealthTracker() + + var wg sync.WaitGroup + workers := 100 + + // Concurrent writes + for i := 0; i < workers; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + name := fmt.Sprintf("worker-%d", i) + if i%2 == 0 { + tracker.MarkHealthy(name) + } else { + tracker.MarkFailed(name) + } + }(i) + } + + // Concurrent reads + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + _ = tracker.IsHealthy() + _ = tracker.GetStatus() + }() + } + + wg.Wait() + + // Verify final state + status := tracker.GetStatus() + workersMap := status["workers"].(map[string]WorkerHealth) + assert.Len(t, workersMap, workers) +} + +func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + // Worker that shuts down quickly (50ms) + fastWorker := newMockWorker("fast", func(ctx context.Context) error { + <-ctx.Done() + time.Sleep(50 * time.Millisecond) + return nil + }) + + // Worker that shuts down slowly (200ms) + slowWorker := newMockWorker("slow", func(ctx context.Context) error { + <-ctx.Done() + time.Sleep(200 * time.Millisecond) + return nil + }) + + // Worker that shuts down instantly + instantWorker := newMockWorker("instant", func(ctx context.Context) error { + <-ctx.Done() + return nil + }) + + registry.Register(fastWorker) + registry.Register(slowWorker) + registry.Register(instantWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + start := time.Now() + go func() { + errChan <- registry.Run(ctx) + }() + + // Give workers time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context and verify graceful shutdown + cancel() + + err := <-errChan + elapsed := time.Since(start) + + assert.NoError(t, err) + + // Registry should wait for the slowest worker (200ms) + // Total time should be at least 200ms (slow worker) + some overhead + assert.True(t, elapsed >= 200*time.Millisecond, + "expected shutdown to take at least 200ms (slowest worker), got %v", elapsed) + + // But not too much longer (should complete within 300ms) + assert.True(t, elapsed < 300*time.Millisecond, + "shutdown took too long: %v", elapsed) +} + +func TestWorkerRegistry_Run_VerySlowShutdown_NoTimeout(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) // No timeout + + // Worker that takes a very long time to shutdown (2 seconds) + verySlowWorker := newMockWorker("very-slow", func(ctx context.Context) error { + <-ctx.Done() + time.Sleep(2 * time.Second) + return nil + }) + + registry.Register(verySlowWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + start := time.Now() + go func() { + errChan <- registry.Run(ctx) + }() + + // Give worker time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context + cancel() + + // Wait for shutdown with timeout + select { + case err := <-errChan: + elapsed := time.Since(start) + assert.NoError(t, err) + + // Should wait the full 2 seconds for worker to finish + assert.True(t, 
elapsed >= 2*time.Second, + "expected to wait at least 2s for slow worker, got %v", elapsed) + + t.Logf("Registry waited %v for slow worker to shutdown gracefully (no timeout)", elapsed) + + case <-time.After(3 * time.Second): + t.Fatal("Registry.Run() blocked for more than 3 seconds") + } +} + +func TestWorkerRegistry_Run_ShutdownTimeout(t *testing.T) { + logger := newMockLogger() + // Set shutdown timeout to 500ms + registry := NewWorkerRegistry(logger, WithShutdownTimeout(500*time.Millisecond)) + + // Worker that takes 2 seconds to shutdown (longer than timeout) + slowWorker := newMockWorker("slow", func(ctx context.Context) error { + <-ctx.Done() + time.Sleep(2 * time.Second) + return nil + }) + + registry.Register(slowWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + start := time.Now() + go func() { + errChan <- registry.Run(ctx) + }() + + // Give worker time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context + cancel() + + err := <-errChan + elapsed := time.Since(start) + + // Should return timeout error + require.Error(t, err) + assert.Contains(t, err.Error(), "shutdown timeout exceeded") + + // Should return after ~500ms (timeout), not 2s (worker shutdown time) + assert.True(t, elapsed >= 500*time.Millisecond, + "expected to wait at least 500ms (timeout), got %v", elapsed) + assert.True(t, elapsed < 1*time.Second, + "expected to timeout before 1s, got %v", elapsed) + + t.Logf("Registry timed out after %v (expected ~500ms)", elapsed) +} + +func TestWorkerRegistry_Run_ShutdownTimeout_FastWorkers(t *testing.T) { + logger := newMockLogger() + // Set shutdown timeout to 2s + registry := NewWorkerRegistry(logger, WithShutdownTimeout(2*time.Second)) + + // Workers that shutdown quickly (100ms) + fastWorker := newMockWorker("fast", func(ctx context.Context) error { + <-ctx.Done() + time.Sleep(100 * time.Millisecond) + return nil + }) + + registry.Register(fastWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + start := time.Now() + go func() { + errChan <- registry.Run(ctx) + }() + + // Give worker time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context + cancel() + + err := <-errChan + elapsed := time.Since(start) + + // Should NOT timeout since worker finishes quickly + assert.NoError(t, err) + + // Should return after ~100ms (worker shutdown time), not 2s (timeout) + assert.True(t, elapsed >= 100*time.Millisecond, + "expected to wait at least 100ms, got %v", elapsed) + assert.True(t, elapsed < 500*time.Millisecond, + "shutdown took too long: %v", elapsed) + + t.Logf("Registry shutdown in %v (workers finished before timeout)", elapsed) +} + +func TestWorkerRegistry_Run_StuckWorker(t *testing.T) { + logger := newMockLogger() + registry := NewWorkerRegistry(logger) + + // Worker that never shuts down (ignores context cancellation) + stuckWorker := newMockWorker("stuck", func(ctx context.Context) error { + // Ignores context, blocks forever + select {} + }) + + registry.Register(stuckWorker) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + errChan := make(chan error, 1) + go func() { + errChan <- registry.Run(ctx) + }() + + // Give worker time to start + time.Sleep(50 * time.Millisecond) + + // Cancel context + cancel() + + // Verify that registry blocks indefinitely waiting for stuck worker + select { + case <-errChan: + t.Fatal("Registry.Run() returned but worker is stuck - should 
block forever") + case <-time.After(500 * time.Millisecond): + // Expected: registry is still waiting for stuck worker + t.Log("Registry correctly blocks waiting for stuck worker (this is expected behavior)") + } + + // Note: In production, this is why orchestrators need timeouts for pod termination + // Kubernetes will forcefully kill pods that don't shutdown within terminationGracePeriodSeconds +} From 3587802088352ecc6cd2c59b188b86dbc02f242a Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Fri, 7 Nov 2025 20:30:36 +0700 Subject: [PATCH 02/19] refactor: move api logic to apirouter pkg --- .../api => apirouter}/auth_middleware.go | 2 +- .../api => apirouter}/auth_middleware_test.go | 51 ++++++++++--------- .../api => apirouter}/destination_handlers.go | 2 +- .../errorhandler_middleware.go | 2 +- internal/{services/api => apirouter}/jwt.go | 2 +- .../{services/api => apirouter}/jwt_test.go | 39 +++++++------- .../api => apirouter}/latency_middleware.go | 2 +- .../latency_middleware_test.go | 9 ++-- .../api => apirouter}/log_handlers.go | 2 +- .../api => apirouter}/logger_middleware.go | 2 +- .../logger_middleware_integration_test.go | 8 +-- .../api => apirouter}/metrics_middleware.go | 2 +- .../api => apirouter}/publish_handlers.go | 2 +- .../publish_handlers_test.go | 2 +- .../requiretenant_middleware.go | 2 +- .../requiretenant_middleware_test.go | 2 +- .../api => apirouter}/retry_handlers.go | 2 +- .../{services/api => apirouter}/router.go | 2 +- .../api => apirouter}/router_test.go | 13 ++--- .../{services/api => apirouter}/sanitizer.go | 2 +- .../api => apirouter}/sanitizer_test.go | 2 +- .../api => apirouter}/tenant_handlers.go | 2 +- .../api => apirouter}/tenant_handlers_test.go | 2 +- .../api => apirouter}/topic_handlers.go | 2 +- .../{services/api => apirouter}/validate.go | 2 +- internal/services/api/api.go | 5 +- 26 files changed, 85 insertions(+), 80 deletions(-) rename internal/{services/api => apirouter}/auth_middleware.go (99%) rename internal/{services/api => apirouter}/auth_middleware_test.go (89%) rename internal/{services/api => apirouter}/destination_handlers.go (99%) rename internal/{services/api => apirouter}/errorhandler_middleware.go (99%) rename internal/{services/api => apirouter}/jwt.go (98%) rename internal/{services/api => apirouter}/jwt_test.go (71%) rename internal/{services/api => apirouter}/latency_middleware.go (96%) rename internal/{services/api => apirouter}/latency_middleware_test.go (89%) rename internal/{services/api => apirouter}/log_handlers.go (99%) rename internal/{services/api => apirouter}/logger_middleware.go (99%) rename internal/{services/api => apirouter}/logger_middleware_integration_test.go (98%) rename internal/{services/api => apirouter}/metrics_middleware.go (96%) rename internal/{services/api => apirouter}/publish_handlers.go (99%) rename internal/{services/api => apirouter}/publish_handlers_test.go (97%) rename internal/{services/api => apirouter}/requiretenant_middleware.go (98%) rename internal/{services/api => apirouter}/requiretenant_middleware_test.go (98%) rename internal/{services/api => apirouter}/retry_handlers.go (99%) rename internal/{services/api => apirouter}/router.go (99%) rename internal/{services/api => apirouter}/router_test.go (97%) rename internal/{services/api => apirouter}/sanitizer.go (99%) rename internal/{services/api => apirouter}/sanitizer_test.go (99%) rename internal/{services/api => apirouter}/tenant_handlers.go (99%) rename internal/{services/api => apirouter}/tenant_handlers_test.go (99%) rename 
internal/{services/api => apirouter}/topic_handlers.go (95%) rename internal/{services/api => apirouter}/validate.go (95%) diff --git a/internal/services/api/auth_middleware.go b/internal/apirouter/auth_middleware.go similarity index 99% rename from internal/services/api/auth_middleware.go rename to internal/apirouter/auth_middleware.go index 3ebc01b6..d28797da 100644 --- a/internal/services/api/auth_middleware.go +++ b/internal/apirouter/auth_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/auth_middleware_test.go b/internal/apirouter/auth_middleware_test.go similarity index 89% rename from internal/services/api/auth_middleware_test.go rename to internal/apirouter/auth_middleware_test.go index 4ade05d0..60db3434 100644 --- a/internal/services/api/auth_middleware_test.go +++ b/internal/apirouter/auth_middleware_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "net/http" @@ -6,8 +6,9 @@ import ( "testing" "github.com/gin-gonic/gin" - api "github.com/hookdeck/outpost/internal/services/api" "github.com/stretchr/testify/assert" + + "github.com/hookdeck/outpost/internal/apirouter" ) func TestPublicRouter(t *testing.T) { @@ -99,7 +100,7 @@ func TestSetTenantIDMiddleware(t *testing.T) { // Create a middleware chain var tenantID string - handler := api.SetTenantIDMiddleware() + handler := apirouter.SetTenantIDMiddleware() nextHandler := func(c *gin.Context) { val, exists := c.Get("tenantID") if exists { @@ -124,7 +125,7 @@ func TestSetTenantIDMiddleware(t *testing.T) { // Create a middleware chain var tenantIDExists bool - handler := api.SetTenantIDMiddleware() + handler := apirouter.SetTenantIDMiddleware() nextHandler := func(c *gin.Context) { _, tenantIDExists = c.Get("tenantID") } @@ -145,7 +146,7 @@ func TestSetTenantIDMiddleware(t *testing.T) { // Create a middleware chain var tenantIDExists bool - handler := api.SetTenantIDMiddleware() + handler := apirouter.SetTenantIDMiddleware() nextHandler := func(c *gin.Context) { _, tenantIDExists = c.Get("tenantID") } @@ -175,7 +176,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { c.Params = []gin.Param{{Key: "tenantID", Value: "different_tenant"}} // Create JWT token for tenantID - token, err := api.JWT.New(jwtSecret, tenantID) + token, err := apirouter.JWT.New(jwtSecret, tenantID) if err != nil { t.Fatal(err) } @@ -185,7 +186,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { c.Request.Header.Set("Authorization", "Bearer "+token) // Test - handler := api.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) handler(c) assert.Equal(t, http.StatusUnauthorized, c.Writer.Status()) @@ -200,7 +201,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { c.Params = []gin.Param{{Key: "tenantID", Value: tenantID}} // Create JWT token for tenantID - token, err := api.JWT.New(jwtSecret, tenantID) + token, err := apirouter.JWT.New(jwtSecret, tenantID) if err != nil { t.Fatal(err) } @@ -211,7 +212,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { // Create a middleware chain var contextTenantID string - handler := api.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) nextHandler := func(c *gin.Context) { val, exists := c.Get("tenantID") if exists { @@ -242,7 +243,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { c.Request.Header.Set("Authorization", "Bearer "+apiKey) // Test - 
handler := api.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware(apiKey, jwtSecret) handler(c) assert.NotEqual(t, http.StatusUnauthorized, c.Writer.Status()) @@ -250,7 +251,7 @@ func TestAPIKeyOrTenantJWTAuthMiddleware(t *testing.T) { } func newJWTToken(t *testing.T, secret string, tenantID string) string { - token, err := api.JWT.New(secret, tenantID) + token, err := apirouter.JWT.New(secret, tenantID) if err != nil { t.Fatal(err) } @@ -338,7 +339,7 @@ func TestTenantJWTAuthMiddleware(t *testing.T) { c.Params = []gin.Param{{Key: "tenantID", Value: tt.paramTenantID}} } - handler := api.TenantJWTAuthMiddleware(tt.apiKey, tt.jwtSecret) + handler := apirouter.TenantJWTAuthMiddleware(tt.apiKey, tt.jwtSecret) handler(c) t.Logf("Test case: %s, Expected: %d, Got: %d", tt.name, tt.wantStatus, w.Code) @@ -362,7 +363,7 @@ func TestAuthRole(t *testing.T) { c, _ := gin.CreateTestContext(w) c.Request = httptest.NewRequest(http.MethodGet, "/", nil) - handler := api.APIKeyAuthMiddleware("") + handler := apirouter.APIKeyAuthMiddleware("") var role string nextHandler := func(c *gin.Context) { val, exists := c.Get("authRole") @@ -374,7 +375,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleAdmin, role) + assert.Equal(t, apirouter.RoleAdmin, role) }) t.Run("should set RoleAdmin when valid API key", func(t *testing.T) { @@ -383,7 +384,7 @@ func TestAuthRole(t *testing.T) { c.Request = httptest.NewRequest(http.MethodGet, "/", nil) c.Request.Header.Set("Authorization", "Bearer key") - handler := api.APIKeyAuthMiddleware("key") + handler := apirouter.APIKeyAuthMiddleware("key") var role string nextHandler := func(c *gin.Context) { val, exists := c.Get("authRole") @@ -395,7 +396,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleAdmin, role) + assert.Equal(t, apirouter.RoleAdmin, role) }) }) @@ -405,7 +406,7 @@ func TestAuthRole(t *testing.T) { c, _ := gin.CreateTestContext(w) c.Request = httptest.NewRequest(http.MethodGet, "/", nil) - handler := api.APIKeyOrTenantJWTAuthMiddleware("", "jwt_secret") + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware("", "jwt_secret") var role string nextHandler := func(c *gin.Context) { val, exists := c.Get("authRole") @@ -417,7 +418,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleAdmin, role) + assert.Equal(t, apirouter.RoleAdmin, role) }) t.Run("should set RoleAdmin when using API key", func(t *testing.T) { @@ -426,7 +427,7 @@ func TestAuthRole(t *testing.T) { c.Request = httptest.NewRequest(http.MethodGet, "/", nil) c.Request.Header.Set("Authorization", "Bearer key") - handler := api.APIKeyOrTenantJWTAuthMiddleware("key", "jwt_secret") + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware("key", "jwt_secret") var role string nextHandler := func(c *gin.Context) { val, exists := c.Get("authRole") @@ -438,7 +439,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleAdmin, role) + assert.Equal(t, apirouter.RoleAdmin, role) }) t.Run("should set RoleTenant when using valid JWT", func(t *testing.T) { @@ -448,7 +449,7 @@ func TestAuthRole(t *testing.T) { token := newJWTToken(t, "jwt_secret", "tenant-id") c.Request.Header.Set("Authorization", "Bearer "+token) - handler := api.APIKeyOrTenantJWTAuthMiddleware("key", "jwt_secret") + handler := apirouter.APIKeyOrTenantJWTAuthMiddleware("key", "jwt_secret") var role string nextHandler := func(c *gin.Context) { val, exists 
:= c.Get("authRole") @@ -460,7 +461,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleTenant, role) + assert.Equal(t, apirouter.RoleTenant, role) }) }) @@ -472,7 +473,7 @@ func TestAuthRole(t *testing.T) { token := newJWTToken(t, "jwt_secret", "tenant-id") c.Request.Header.Set("Authorization", "Bearer "+token) - handler := api.TenantJWTAuthMiddleware("key", "jwt_secret") + handler := apirouter.TenantJWTAuthMiddleware("key", "jwt_secret") var role string nextHandler := func(c *gin.Context) { val, exists := c.Get("authRole") @@ -484,7 +485,7 @@ func TestAuthRole(t *testing.T) { handler(c) nextHandler(c) - assert.Equal(t, api.RoleTenant, role) + assert.Equal(t, apirouter.RoleTenant, role) }) t.Run("should not set role when apiKey is empty", func(t *testing.T) { @@ -494,7 +495,7 @@ func TestAuthRole(t *testing.T) { token := newJWTToken(t, "jwt_secret", "tenant-id") c.Request.Header.Set("Authorization", "Bearer "+token) - handler := api.TenantJWTAuthMiddleware("", "jwt_secret") + handler := apirouter.TenantJWTAuthMiddleware("", "jwt_secret") var roleExists bool nextHandler := func(c *gin.Context) { _, roleExists = c.Get("authRole") diff --git a/internal/services/api/destination_handlers.go b/internal/apirouter/destination_handlers.go similarity index 99% rename from internal/services/api/destination_handlers.go rename to internal/apirouter/destination_handlers.go index fdb0eac1..68292215 100644 --- a/internal/services/api/destination_handlers.go +++ b/internal/apirouter/destination_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/errorhandler_middleware.go b/internal/apirouter/errorhandler_middleware.go similarity index 99% rename from internal/services/api/errorhandler_middleware.go rename to internal/apirouter/errorhandler_middleware.go index 979295c9..2bf74901 100644 --- a/internal/services/api/errorhandler_middleware.go +++ b/internal/apirouter/errorhandler_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "encoding/json" diff --git a/internal/services/api/jwt.go b/internal/apirouter/jwt.go similarity index 98% rename from internal/services/api/jwt.go rename to internal/apirouter/jwt.go index 5c1bb79c..d0f5f5c6 100644 --- a/internal/services/api/jwt.go +++ b/internal/apirouter/jwt.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/jwt_test.go b/internal/apirouter/jwt_test.go similarity index 71% rename from internal/services/api/jwt_test.go rename to internal/apirouter/jwt_test.go index 5d01c2fd..5a90acbd 100644 --- a/internal/services/api/jwt_test.go +++ b/internal/apirouter/jwt_test.go @@ -1,12 +1,13 @@ -package api_test +package apirouter_test import ( "testing" "time" "github.com/golang-jwt/jwt/v5" - api "github.com/hookdeck/outpost/internal/services/api" "github.com/stretchr/testify/assert" + + "github.com/hookdeck/outpost/internal/apirouter" ) func TestJWT(t *testing.T) { @@ -19,37 +20,37 @@ func TestJWT(t *testing.T) { t.Run("should generate a new jwt token", func(t *testing.T) { t.Parallel() - token, err := api.JWT.New(jwtKey, tenantID) + token, err := apirouter.JWT.New(jwtKey, tenantID) assert.Nil(t, err) assert.NotEqual(t, "", token) }) t.Run("should verify a valid jwt token", func(t *testing.T) { t.Parallel() - token, err := api.JWT.New(jwtKey, tenantID) + token, err := apirouter.JWT.New(jwtKey, tenantID) if err != nil { t.Fatal(err) } - valid, err := api.JWT.Verify(jwtKey, token, tenantID) + valid, err 
:= apirouter.JWT.Verify(jwtKey, token, tenantID) assert.Nil(t, err) assert.True(t, valid) }) t.Run("should extract tenantID from valid token", func(t *testing.T) { t.Parallel() - token, err := api.JWT.New(jwtKey, tenantID) + token, err := apirouter.JWT.New(jwtKey, tenantID) if err != nil { t.Fatal(err) } - extractedTenantID, err := api.JWT.ExtractTenantID(jwtKey, token) + extractedTenantID, err := apirouter.JWT.ExtractTenantID(jwtKey, token) assert.Nil(t, err) assert.Equal(t, tenantID, extractedTenantID) }) t.Run("should fail to extract tenantID from invalid token", func(t *testing.T) { t.Parallel() - _, err := api.JWT.ExtractTenantID(jwtKey, "invalid_token") - assert.ErrorIs(t, err, api.ErrInvalidToken) + _, err := apirouter.JWT.ExtractTenantID(jwtKey, "invalid_token") + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) }) t.Run("should fail to extract tenantID from token with invalid issuer", func(t *testing.T) { @@ -65,14 +66,14 @@ func TestJWT(t *testing.T) { if err != nil { t.Fatal(err) } - _, err = api.JWT.ExtractTenantID(jwtKey, token) - assert.ErrorIs(t, err, api.ErrInvalidToken) + _, err = apirouter.JWT.ExtractTenantID(jwtKey, token) + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) }) t.Run("should reject an invalid token", func(t *testing.T) { t.Parallel() - valid, err := api.JWT.Verify(jwtKey, "invalid_token", tenantID) - assert.ErrorIs(t, err, api.ErrInvalidToken) + valid, err := apirouter.JWT.Verify(jwtKey, "invalid_token", tenantID) + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) assert.False(t, valid) }) @@ -89,8 +90,8 @@ func TestJWT(t *testing.T) { if err != nil { t.Fatal(err) } - valid, err := api.JWT.Verify(jwtKey, token, tenantID) - assert.ErrorIs(t, err, api.ErrInvalidToken) + valid, err := apirouter.JWT.Verify(jwtKey, token, tenantID) + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) assert.False(t, valid) }) @@ -107,8 +108,8 @@ func TestJWT(t *testing.T) { if err != nil { t.Fatal(err) } - valid, err := api.JWT.Verify(jwtKey, token, tenantID) - assert.ErrorIs(t, err, api.ErrInvalidToken) + valid, err := apirouter.JWT.Verify(jwtKey, token, tenantID) + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) assert.False(t, valid) }) @@ -125,8 +126,8 @@ func TestJWT(t *testing.T) { if err != nil { t.Fatal(err) } - valid, err := api.JWT.Verify(jwtKey, token, tenantID) - assert.ErrorIs(t, err, api.ErrInvalidToken) + valid, err := apirouter.JWT.Verify(jwtKey, token, tenantID) + assert.ErrorIs(t, err, apirouter.ErrInvalidToken) assert.False(t, valid) }) } diff --git a/internal/services/api/latency_middleware.go b/internal/apirouter/latency_middleware.go similarity index 96% rename from internal/services/api/latency_middleware.go rename to internal/apirouter/latency_middleware.go index 66efabea..ea1bbeb7 100644 --- a/internal/services/api/latency_middleware.go +++ b/internal/apirouter/latency_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "time" diff --git a/internal/services/api/latency_middleware_test.go b/internal/apirouter/latency_middleware_test.go similarity index 89% rename from internal/services/api/latency_middleware_test.go rename to internal/apirouter/latency_middleware_test.go index 563c152f..5ec55d28 100644 --- a/internal/services/api/latency_middleware_test.go +++ b/internal/apirouter/latency_middleware_test.go @@ -1,4 +1,4 @@ -package api +package apirouter_test import ( "net/http" @@ -7,6 +7,7 @@ import ( "time" "github.com/gin-gonic/gin" + "github.com/hookdeck/outpost/internal/apirouter" "github.com/stretchr/testify/assert" ) @@ 
-24,7 +25,7 @@ func TestMiddlewareOrder(t *testing.T) { executionOrder = append(executionOrder, "metrics_start") c.Next() executionOrder = append(executionOrder, "metrics_end") - metricsLatency = GetRequestLatency(c) + metricsLatency = apirouter.GetRequestLatency(c) } } @@ -34,7 +35,7 @@ func TestMiddlewareOrder(t *testing.T) { executionOrder = append(executionOrder, "logger_start") c.Next() executionOrder = append(executionOrder, "logger_end") - loggerLatency = GetRequestLatency(c) + loggerLatency = apirouter.GetRequestLatency(c) } } @@ -42,7 +43,7 @@ func TestMiddlewareOrder(t *testing.T) { r := gin.New() r.Use(mockMetrics()) r.Use(mockLogger()) - r.Use(LatencyMiddleware()) + r.Use(apirouter.LatencyMiddleware()) // Add a handler that sleeps to simulate work r.GET("/test", func(c *gin.Context) { diff --git a/internal/services/api/log_handlers.go b/internal/apirouter/log_handlers.go similarity index 99% rename from internal/services/api/log_handlers.go rename to internal/apirouter/log_handlers.go index ad29e931..e827021b 100644 --- a/internal/services/api/log_handlers.go +++ b/internal/apirouter/log_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "net/http" diff --git a/internal/services/api/logger_middleware.go b/internal/apirouter/logger_middleware.go similarity index 99% rename from internal/services/api/logger_middleware.go rename to internal/apirouter/logger_middleware.go index f249528f..0e0cdcf7 100644 --- a/internal/services/api/logger_middleware.go +++ b/internal/apirouter/logger_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "fmt" diff --git a/internal/services/api/logger_middleware_integration_test.go b/internal/apirouter/logger_middleware_integration_test.go similarity index 98% rename from internal/services/api/logger_middleware_integration_test.go rename to internal/apirouter/logger_middleware_integration_test.go index 398e6c90..dd92a71a 100644 --- a/internal/services/api/logger_middleware_integration_test.go +++ b/internal/apirouter/logger_middleware_integration_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "bytes" @@ -15,7 +15,7 @@ import ( "github.com/hookdeck/outpost/internal/destregistry/metadata" "github.com/hookdeck/outpost/internal/logging" "github.com/hookdeck/outpost/internal/models" - "github.com/hookdeck/outpost/internal/services/api" + "github.com/hookdeck/outpost/internal/apirouter" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/uptrace/opentelemetry-go-extra/otelzap" @@ -134,9 +134,9 @@ func setupTestEnvironment(t *testing.T) (*gin.Engine, *observer.ObservedLogs, de registry := &mockRegistry{loader: metadataLoader} // Create sanitizer and router - sanitizer := api.NewRequestBodySanitizer(registry) + sanitizer := apirouter.NewRequestBodySanitizer(registry) router := gin.New() - router.Use(api.LoggerMiddlewareWithSanitizer(testLogger, sanitizer)) + router.Use(apirouter.LoggerMiddlewareWithSanitizer(testLogger, sanitizer)) return router, logs, registry } diff --git a/internal/services/api/metrics_middleware.go b/internal/apirouter/metrics_middleware.go similarity index 96% rename from internal/services/api/metrics_middleware.go rename to internal/apirouter/metrics_middleware.go index c557bb98..daa8a314 100644 --- a/internal/services/api/metrics_middleware.go +++ b/internal/apirouter/metrics_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "github.com/gin-gonic/gin" diff --git a/internal/services/api/publish_handlers.go 
b/internal/apirouter/publish_handlers.go similarity index 99% rename from internal/services/api/publish_handlers.go rename to internal/apirouter/publish_handlers.go index 69400ddf..bab8354f 100644 --- a/internal/services/api/publish_handlers.go +++ b/internal/apirouter/publish_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/publish_handlers_test.go b/internal/apirouter/publish_handlers_test.go similarity index 97% rename from internal/services/api/publish_handlers_test.go rename to internal/apirouter/publish_handlers_test.go index 07d3d1b1..fdc026f4 100644 --- a/internal/services/api/publish_handlers_test.go +++ b/internal/apirouter/publish_handlers_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "encoding/json" diff --git a/internal/services/api/requiretenant_middleware.go b/internal/apirouter/requiretenant_middleware.go similarity index 98% rename from internal/services/api/requiretenant_middleware.go rename to internal/apirouter/requiretenant_middleware.go index 2d51fb50..022ea1a0 100644 --- a/internal/services/api/requiretenant_middleware.go +++ b/internal/apirouter/requiretenant_middleware.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "net/http" diff --git a/internal/services/api/requiretenant_middleware_test.go b/internal/apirouter/requiretenant_middleware_test.go similarity index 98% rename from internal/services/api/requiretenant_middleware_test.go rename to internal/apirouter/requiretenant_middleware_test.go index e5681284..5cddbe0f 100644 --- a/internal/services/api/requiretenant_middleware_test.go +++ b/internal/apirouter/requiretenant_middleware_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "context" diff --git a/internal/services/api/retry_handlers.go b/internal/apirouter/retry_handlers.go similarity index 99% rename from internal/services/api/retry_handlers.go rename to internal/apirouter/retry_handlers.go index 3f5acb9c..d6035e4b 100644 --- a/internal/services/api/retry_handlers.go +++ b/internal/apirouter/retry_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/router.go b/internal/apirouter/router.go similarity index 99% rename from internal/services/api/router.go rename to internal/apirouter/router.go index 36fc7d8b..a0f3e009 100644 --- a/internal/services/api/router.go +++ b/internal/apirouter/router.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "errors" diff --git a/internal/services/api/router_test.go b/internal/apirouter/router_test.go similarity index 97% rename from internal/services/api/router_test.go rename to internal/apirouter/router_test.go index dcdf0e47..8c14396b 100644 --- a/internal/services/api/router_test.go +++ b/internal/apirouter/router_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "context" @@ -18,8 +18,9 @@ import ( "github.com/hookdeck/outpost/internal/models" "github.com/hookdeck/outpost/internal/publishmq" "github.com/hookdeck/outpost/internal/redis" - "github.com/hookdeck/outpost/internal/services/api" "github.com/hookdeck/outpost/internal/telemetry" + + "github.com/hookdeck/outpost/internal/apirouter" "github.com/hookdeck/outpost/internal/util/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -37,8 +38,8 @@ func setupTestRouter(t *testing.T, apiKey, jwtSecret string, funcs ...func(t *te entityStore := setupTestEntityStore(t, redisClient, nil) logStore := 
setupTestLogStore(t, funcs...) eventHandler := publishmq.NewEventHandler(logger, deliveryMQ, entityStore, eventTracer, testutil.TestTopics, idempotence.New(redisClient, idempotence.WithSuccessfulTTL(24*time.Hour))) - router := api.NewRouter( - api.RouterConfig{ + router := apirouter.NewRouter( + apirouter.RouterConfig{ ServiceName: "", APIKey: apiKey, JWTSecret: jwtSecret, @@ -88,7 +89,7 @@ func TestRouterWithAPIKey(t *testing.T) { router, _, _ := setupTestRouter(t, apiKey, jwtSecret) tenantID := "tenantID" - validToken, err := api.JWT.New(jwtSecret, tenantID) + validToken, err := apirouter.JWT.New(jwtSecret, tenantID) if err != nil { t.Fatal(err) } @@ -201,7 +202,7 @@ func TestRouterWithoutAPIKey(t *testing.T) { router, _, _ := setupTestRouter(t, apiKey, jwtSecret) tenantID := "tenantID" - validToken, err := api.JWT.New(jwtSecret, tenantID) + validToken, err := apirouter.JWT.New(jwtSecret, tenantID) if err != nil { t.Fatal(err) } diff --git a/internal/services/api/sanitizer.go b/internal/apirouter/sanitizer.go similarity index 99% rename from internal/services/api/sanitizer.go rename to internal/apirouter/sanitizer.go index a4a41b12..97ec8d18 100644 --- a/internal/services/api/sanitizer.go +++ b/internal/apirouter/sanitizer.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "bytes" diff --git a/internal/services/api/sanitizer_test.go b/internal/apirouter/sanitizer_test.go similarity index 99% rename from internal/services/api/sanitizer_test.go rename to internal/apirouter/sanitizer_test.go index 7d878f85..23f1bb33 100644 --- a/internal/services/api/sanitizer_test.go +++ b/internal/apirouter/sanitizer_test.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "strings" diff --git a/internal/services/api/tenant_handlers.go b/internal/apirouter/tenant_handlers.go similarity index 99% rename from internal/services/api/tenant_handlers.go rename to internal/apirouter/tenant_handlers.go index ca41de70..059c2fcb 100644 --- a/internal/services/api/tenant_handlers.go +++ b/internal/apirouter/tenant_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "net/http" diff --git a/internal/services/api/tenant_handlers_test.go b/internal/apirouter/tenant_handlers_test.go similarity index 99% rename from internal/services/api/tenant_handlers_test.go rename to internal/apirouter/tenant_handlers_test.go index f9a6e065..ef0b1336 100644 --- a/internal/services/api/tenant_handlers_test.go +++ b/internal/apirouter/tenant_handlers_test.go @@ -1,4 +1,4 @@ -package api_test +package apirouter_test import ( "context" diff --git a/internal/services/api/topic_handlers.go b/internal/apirouter/topic_handlers.go similarity index 95% rename from internal/services/api/topic_handlers.go rename to internal/apirouter/topic_handlers.go index 996df67e..8af9e4ea 100644 --- a/internal/services/api/topic_handlers.go +++ b/internal/apirouter/topic_handlers.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "net/http" diff --git a/internal/services/api/validate.go b/internal/apirouter/validate.go similarity index 95% rename from internal/services/api/validate.go rename to internal/apirouter/validate.go index ec30a44e..a7d9a569 100644 --- a/internal/services/api/validate.go +++ b/internal/apirouter/validate.go @@ -1,4 +1,4 @@ -package api +package apirouter import ( "net/http" diff --git a/internal/services/api/api.go b/internal/services/api/api.go index ffc764e1..cae7847d 100644 --- a/internal/services/api/api.go +++ b/internal/services/api/api.go @@ -8,6 +8,7 @@ import ( "sync" "time" + 
"github.com/hookdeck/outpost/internal/apirouter" "github.com/hookdeck/outpost/internal/config" "github.com/hookdeck/outpost/internal/consumer" "github.com/hookdeck/outpost/internal/deliverymq" @@ -128,8 +129,8 @@ func NewService(ctx context.Context, wg *sync.WaitGroup, cfg *config.Config, log idempotence.WithSuccessfulTTL(time.Duration(cfg.PublishIdempotencyKeyTTL)*time.Second), ) eventHandler := publishmq.NewEventHandler(logger, deliveryMQ, entityStore, eventTracer, cfg.Topics, publishIdempotence) - router := NewRouter( - RouterConfig{ + router := apirouter.NewRouter( + apirouter.RouterConfig{ ServiceName: cfg.OpenTelemetry.GetServiceName(), APIKey: cfg.APIKey, JWTSecret: cfg.APIJWTSecret, From 23c1487714a98a35561b713a5e79ef44460b0dae Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 00:51:06 +0700 Subject: [PATCH 03/19] chore: rename worker "registry" to "supervisor" --- internal/worker/logger_test.go | 8 +- .../worker/{registry.go => supervisor.go} | 32 ++-- internal/worker/worker_test.go | 138 +++++++++--------- 3 files changed, 89 insertions(+), 89 deletions(-) rename internal/worker/{registry.go => supervisor.go} (81%) diff --git a/internal/worker/logger_test.go b/internal/worker/logger_test.go index 55c761ad..6dd0f2f4 100644 --- a/internal/worker/logger_test.go +++ b/internal/worker/logger_test.go @@ -15,9 +15,9 @@ func TestLoggingLoggerImplementsInterface(t *testing.T) { // This will fail to compile if *logging.Logger doesn't implement worker.Logger var _ worker.Logger = logger - // Also verify we can actually use it with WorkerRegistry - registry := worker.NewWorkerRegistry(logger) - if registry == nil { - t.Fatal("expected non-nil registry") + // Also verify we can actually use it with WorkerSupervisor + supervisor := worker.NewWorkerSupervisor(logger) + if supervisor == nil { + t.Fatal("expected non-nil supervisor") } } diff --git a/internal/worker/registry.go b/internal/worker/supervisor.go similarity index 81% rename from internal/worker/registry.go rename to internal/worker/supervisor.go index 511d95bd..3ec0b489 100644 --- a/internal/worker/registry.go +++ b/internal/worker/supervisor.go @@ -18,30 +18,30 @@ type Logger interface { Warn(msg string, fields ...zap.Field) } -// WorkerRegistry manages and supervises multiple workers. +// WorkerSupervisor manages and supervises multiple workers. // It tracks their health and handles graceful shutdown. -type WorkerRegistry struct { +type WorkerSupervisor struct { workers map[string]Worker health *HealthTracker logger Logger shutdownTimeout time.Duration // 0 means no timeout } -// RegistryOption configures a WorkerRegistry. -type RegistryOption func(*WorkerRegistry) +// SupervisorOption configures a WorkerSupervisor. +type SupervisorOption func(*WorkerSupervisor) // WithShutdownTimeout sets the maximum time to wait for workers to shutdown gracefully. // After this timeout, Run() will return even if workers haven't finished. // Default is 0 (no timeout - wait indefinitely). -func WithShutdownTimeout(timeout time.Duration) RegistryOption { - return func(r *WorkerRegistry) { +func WithShutdownTimeout(timeout time.Duration) SupervisorOption { + return func(r *WorkerSupervisor) { r.shutdownTimeout = timeout } } -// NewWorkerRegistry creates a new WorkerRegistry. -func NewWorkerRegistry(logger Logger, opts ...RegistryOption) *WorkerRegistry { - r := &WorkerRegistry{ +// NewWorkerSupervisor creates a new WorkerSupervisor. 
+func NewWorkerSupervisor(logger Logger, opts ...SupervisorOption) *WorkerSupervisor { + r := &WorkerSupervisor{ workers: make(map[string]Worker), health: NewHealthTracker(), logger: logger, @@ -55,9 +55,9 @@ func NewWorkerRegistry(logger Logger, opts ...RegistryOption) *WorkerRegistry { return r } -// Register adds a worker to the registry. +// Register adds a worker to the supervisor. // Panics if a worker with the same name is already registered. -func (r *WorkerRegistry) Register(w Worker) { +func (r *WorkerSupervisor) Register(w Worker) { if _, exists := r.workers[w.Name()]; exists { panic(fmt.Sprintf("worker %s already registered", w.Name())) } @@ -65,8 +65,8 @@ func (r *WorkerRegistry) Register(w Worker) { r.logger.Debug("worker registered", zap.String("worker", w.Name())) } -// GetHealthTracker returns the health tracker for this registry. -func (r *WorkerRegistry) GetHealthTracker() *HealthTracker { +// GetHealthTracker returns the health tracker for this supervisor. +func (r *WorkerSupervisor) GetHealthTracker() *HealthTracker { return r.health } @@ -83,7 +83,7 @@ func (r *WorkerRegistry) GetHealthTracker() *HealthTracker { // // Returns nil if context was cancelled and workers shutdown gracefully. // Returns error if workers failed to shutdown within timeout (if configured). -func (r *WorkerRegistry) Run(ctx context.Context) error { +func (r *WorkerSupervisor) Run(ctx context.Context) error { if len(r.workers) == 0 { r.logger.Warn("no workers registered") return nil @@ -139,7 +139,7 @@ func (r *WorkerRegistry) Run(ctx context.Context) error { // waitForWorkers converts WaitGroup.Wait() into a channel that can be used in select. // Returns a channel that closes when all workers have exited. -func (r *WorkerRegistry) waitForWorkers(wg *sync.WaitGroup) <-chan struct{} { +func (r *WorkerSupervisor) waitForWorkers(wg *sync.WaitGroup) <-chan struct{} { done := make(chan struct{}) go func() { wg.Wait() @@ -151,7 +151,7 @@ func (r *WorkerRegistry) waitForWorkers(wg *sync.WaitGroup) <-chan struct{} { // waitWithTimeout waits for the WaitGroup with a timeout. // Returns nil if all workers finish within timeout. // Returns error if timeout is exceeded. 
-func (r *WorkerRegistry) waitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { +func (r *WorkerSupervisor) waitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { select { case <-r.waitForWorkers(wg): r.logger.Info("all workers shutdown gracefully") diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go index a1c783de..a36f58ce 100644 --- a/internal/worker/worker_test.go +++ b/internal/worker/worker_test.go @@ -175,34 +175,34 @@ func TestHealthTracker_NoErrorExposed(t *testing.T) { } } -func TestWorkerRegistry_RegisterWorker(t *testing.T) { +func TestWorkerSupervisor_RegisterWorker(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) worker := newMockWorker("test-worker", nil) - registry.Register(worker) + supervisor.Register(worker) - assert.Len(t, registry.workers, 1) + assert.Len(t, supervisor.workers, 1) assert.True(t, logger.Contains("worker registered")) } -func TestWorkerRegistry_RegisterDuplicateWorker(t *testing.T) { +func TestWorkerSupervisor_RegisterDuplicateWorker(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) worker1 := newMockWorker("test-worker", nil) worker2 := newMockWorker("test-worker", nil) - registry.Register(worker1) + supervisor.Register(worker1) assert.Panics(t, func() { - registry.Register(worker2) + supervisor.Register(worker2) }) } -func TestWorkerRegistry_Run_HealthyWorkers(t *testing.T) { +func TestWorkerSupervisor_Run_HealthyWorkers(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) worker1 := newMockWorker("worker-1", func(ctx context.Context) error { <-ctx.Done() @@ -213,15 +213,15 @@ func TestWorkerRegistry_Run_HealthyWorkers(t *testing.T) { return nil }) - registry.Register(worker1) - registry.Register(worker2) + supervisor.Register(worker1) + supervisor.Register(worker2) ctx, cancel := context.WithCancel(context.Background()) defer cancel() errChan := make(chan error, 1) go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give workers time to start @@ -232,7 +232,7 @@ func TestWorkerRegistry_Run_HealthyWorkers(t *testing.T) { assert.True(t, worker2.WasStarted()) // Verify health - assert.True(t, registry.GetHealthTracker().IsHealthy()) + assert.True(t, supervisor.GetHealthTracker().IsHealthy()) // Cancel context and verify graceful shutdown cancel() @@ -241,9 +241,9 @@ func TestWorkerRegistry_Run_HealthyWorkers(t *testing.T) { assert.NoError(t, err) } -func TestWorkerRegistry_Run_FailedWorker(t *testing.T) { +func TestWorkerSupervisor_Run_FailedWorker(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) healthyWorker := newMockWorker("healthy", func(ctx context.Context) error { <-ctx.Done() @@ -255,33 +255,33 @@ func TestWorkerRegistry_Run_FailedWorker(t *testing.T) { return errors.New("worker failed") }) - registry.Register(healthyWorker) - registry.Register(failingWorker) + supervisor.Register(healthyWorker) + supervisor.Register(failingWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() errChan := make(chan error, 1) go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Wait for failing worker to fail time.Sleep(100 * time.Millisecond) - // Verify health reflects failure while registry is still running - assert.False(t, 
registry.GetHealthTracker().IsHealthy()) + // Verify health reflects failure while supervisor is still running + assert.False(t, supervisor.GetHealthTracker().IsHealthy()) - status := registry.GetHealthTracker().GetStatus() + status := supervisor.GetHealthTracker().GetStatus() assert.Equal(t, "failed", status["status"]) workers := status["workers"].(map[string]WorkerHealth) assert.Equal(t, WorkerStatusFailed, workers["failing"].Status) - // Verify that registry is still blocking (hasn't returned yet) + // Verify that supervisor is still blocking (hasn't returned yet) select { case <-errChan: - t.Fatal("registry.Run() returned early - should keep running until context cancelled") + t.Fatal("supervisor.Run() returned early - should keep running until context cancelled") default: // Good - still blocking } @@ -292,9 +292,9 @@ func TestWorkerRegistry_Run_FailedWorker(t *testing.T) { assert.NoError(t, err) // Graceful shutdown should return nil } -func TestWorkerRegistry_Run_AllWorkersExit(t *testing.T) { +func TestWorkerSupervisor_Run_AllWorkersExit(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) // Both workers exit on their own (not from context cancellation) worker1 := newMockWorker("worker-1", func(ctx context.Context) error { @@ -307,14 +307,14 @@ func TestWorkerRegistry_Run_AllWorkersExit(t *testing.T) { return errors.New("worker 2 failed") }) - registry.Register(worker1) - registry.Register(worker2) + supervisor.Register(worker1) + supervisor.Register(worker2) ctx := context.Background() errChan := make(chan error, 1) go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Wait for both workers to exit @@ -322,9 +322,9 @@ func TestWorkerRegistry_Run_AllWorkersExit(t *testing.T) { assert.NoError(t, err) // Should return nil when all workers exit // Verify both workers are marked as failed - assert.False(t, registry.GetHealthTracker().IsHealthy()) + assert.False(t, supervisor.GetHealthTracker().IsHealthy()) - status := registry.GetHealthTracker().GetStatus() + status := supervisor.GetHealthTracker().GetStatus() assert.Equal(t, "failed", status["status"]) workers := status["workers"].(map[string]WorkerHealth) @@ -335,22 +335,22 @@ func TestWorkerRegistry_Run_AllWorkersExit(t *testing.T) { assert.True(t, logger.Contains("all workers have exited")) } -func TestWorkerRegistry_Run_ContextCanceled(t *testing.T) { +func TestWorkerSupervisor_Run_ContextCanceled(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) worker := newMockWorker("worker-1", func(ctx context.Context) error { <-ctx.Done() return context.Canceled }) - registry.Register(worker) + supervisor.Register(worker) ctx, cancel := context.WithCancel(context.Background()) errChan := make(chan error, 1) go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give worker time to start @@ -363,12 +363,12 @@ func TestWorkerRegistry_Run_ContextCanceled(t *testing.T) { assert.NoError(t, err) // context.Canceled is not treated as error } -func TestWorkerRegistry_Run_NoWorkers(t *testing.T) { +func TestWorkerSupervisor_Run_NoWorkers(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) ctx := context.Background() - err := registry.Run(ctx) + err := supervisor.Run(ctx) assert.NoError(t, err) assert.True(t, logger.Contains("no workers registered")) @@ -414,9 +414,9 @@ func 
TestHealthTracker_ConcurrentAccess(t *testing.T) { assert.Len(t, workersMap, workers) } -func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { +func TestWorkerSupervisor_Run_VariableShutdownTiming(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) // Worker that shuts down quickly (50ms) fastWorker := newMockWorker("fast", func(ctx context.Context) error { @@ -438,9 +438,9 @@ func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { return nil }) - registry.Register(fastWorker) - registry.Register(slowWorker) - registry.Register(instantWorker) + supervisor.Register(fastWorker) + supervisor.Register(slowWorker) + supervisor.Register(instantWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -448,7 +448,7 @@ func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { errChan := make(chan error, 1) start := time.Now() go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give workers time to start @@ -462,7 +462,7 @@ func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { assert.NoError(t, err) - // Registry should wait for the slowest worker (200ms) + // Supervisor should wait for the slowest worker (200ms) // Total time should be at least 200ms (slow worker) + some overhead assert.True(t, elapsed >= 200*time.Millisecond, "expected shutdown to take at least 200ms (slowest worker), got %v", elapsed) @@ -472,9 +472,9 @@ func TestWorkerRegistry_Run_VariableShutdownTiming(t *testing.T) { "shutdown took too long: %v", elapsed) } -func TestWorkerRegistry_Run_VerySlowShutdown_NoTimeout(t *testing.T) { +func TestWorkerSupervisor_Run_VerySlowShutdown_NoTimeout(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) // No timeout + supervisor := NewWorkerSupervisor(logger) // No timeout // Worker that takes a very long time to shutdown (2 seconds) verySlowWorker := newMockWorker("very-slow", func(ctx context.Context) error { @@ -483,7 +483,7 @@ func TestWorkerRegistry_Run_VerySlowShutdown_NoTimeout(t *testing.T) { return nil }) - registry.Register(verySlowWorker) + supervisor.Register(verySlowWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -491,7 +491,7 @@ func TestWorkerRegistry_Run_VerySlowShutdown_NoTimeout(t *testing.T) { errChan := make(chan error, 1) start := time.Now() go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give worker time to start @@ -510,17 +510,17 @@ func TestWorkerRegistry_Run_VerySlowShutdown_NoTimeout(t *testing.T) { assert.True(t, elapsed >= 2*time.Second, "expected to wait at least 2s for slow worker, got %v", elapsed) - t.Logf("Registry waited %v for slow worker to shutdown gracefully (no timeout)", elapsed) + t.Logf("Supervisor waited %v for slow worker to shutdown gracefully (no timeout)", elapsed) case <-time.After(3 * time.Second): - t.Fatal("Registry.Run() blocked for more than 3 seconds") + t.Fatal("Supervisor.Run() blocked for more than 3 seconds") } } -func TestWorkerRegistry_Run_ShutdownTimeout(t *testing.T) { +func TestWorkerSupervisor_Run_ShutdownTimeout(t *testing.T) { logger := newMockLogger() // Set shutdown timeout to 500ms - registry := NewWorkerRegistry(logger, WithShutdownTimeout(500*time.Millisecond)) + supervisor := NewWorkerSupervisor(logger, WithShutdownTimeout(500*time.Millisecond)) // Worker that takes 2 seconds to shutdown (longer than timeout) slowWorker := newMockWorker("slow", 
func(ctx context.Context) error { @@ -529,7 +529,7 @@ func TestWorkerRegistry_Run_ShutdownTimeout(t *testing.T) { return nil }) - registry.Register(slowWorker) + supervisor.Register(slowWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -537,7 +537,7 @@ func TestWorkerRegistry_Run_ShutdownTimeout(t *testing.T) { errChan := make(chan error, 1) start := time.Now() go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give worker time to start @@ -559,13 +559,13 @@ func TestWorkerRegistry_Run_ShutdownTimeout(t *testing.T) { assert.True(t, elapsed < 1*time.Second, "expected to timeout before 1s, got %v", elapsed) - t.Logf("Registry timed out after %v (expected ~500ms)", elapsed) + t.Logf("Supervisor timed out after %v (expected ~500ms)", elapsed) } -func TestWorkerRegistry_Run_ShutdownTimeout_FastWorkers(t *testing.T) { +func TestWorkerSupervisor_Run_ShutdownTimeout_FastWorkers(t *testing.T) { logger := newMockLogger() // Set shutdown timeout to 2s - registry := NewWorkerRegistry(logger, WithShutdownTimeout(2*time.Second)) + supervisor := NewWorkerSupervisor(logger, WithShutdownTimeout(2*time.Second)) // Workers that shutdown quickly (100ms) fastWorker := newMockWorker("fast", func(ctx context.Context) error { @@ -574,7 +574,7 @@ func TestWorkerRegistry_Run_ShutdownTimeout_FastWorkers(t *testing.T) { return nil }) - registry.Register(fastWorker) + supervisor.Register(fastWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -582,7 +582,7 @@ func TestWorkerRegistry_Run_ShutdownTimeout_FastWorkers(t *testing.T) { errChan := make(chan error, 1) start := time.Now() go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give worker time to start @@ -603,12 +603,12 @@ func TestWorkerRegistry_Run_ShutdownTimeout_FastWorkers(t *testing.T) { assert.True(t, elapsed < 500*time.Millisecond, "shutdown took too long: %v", elapsed) - t.Logf("Registry shutdown in %v (workers finished before timeout)", elapsed) + t.Logf("Supervisor shutdown in %v (workers finished before timeout)", elapsed) } -func TestWorkerRegistry_Run_StuckWorker(t *testing.T) { +func TestWorkerSupervisor_Run_StuckWorker(t *testing.T) { logger := newMockLogger() - registry := NewWorkerRegistry(logger) + supervisor := NewWorkerSupervisor(logger) // Worker that never shuts down (ignores context cancellation) stuckWorker := newMockWorker("stuck", func(ctx context.Context) error { @@ -616,14 +616,14 @@ func TestWorkerRegistry_Run_StuckWorker(t *testing.T) { select {} }) - registry.Register(stuckWorker) + supervisor.Register(stuckWorker) ctx, cancel := context.WithCancel(context.Background()) defer cancel() errChan := make(chan error, 1) go func() { - errChan <- registry.Run(ctx) + errChan <- supervisor.Run(ctx) }() // Give worker time to start @@ -632,13 +632,13 @@ func TestWorkerRegistry_Run_StuckWorker(t *testing.T) { // Cancel context cancel() - // Verify that registry blocks indefinitely waiting for stuck worker + // Verify that supervisor blocks indefinitely waiting for stuck worker select { case <-errChan: - t.Fatal("Registry.Run() returned but worker is stuck - should block forever") + t.Fatal("Supervisor.Run() returned but worker is stuck - should block forever") case <-time.After(500 * time.Millisecond): - // Expected: registry is still waiting for stuck worker - t.Log("Registry correctly blocks waiting for stuck worker (this is expected behavior)") + // Expected: supervisor is still waiting for stuck worker + 
t.Log("Supervisor correctly blocks waiting for stuck worker (this is expected behavior)") } // Note: In production, this is why orchestrators need timeouts for pod termination From ac111922ce165a3117031bd6324ce37fd2a7ed23 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 00:51:28 +0700 Subject: [PATCH 04/19] feat: service builder --- internal/app/app.go | 112 ++-- internal/services/builder.go | 604 ++++++++++++++++++++ internal/services/deliverymq_worker.go | 64 +++ internal/services/http_worker.go | 64 +++ internal/services/logmq_worker.go | 64 +++ internal/services/publishmq_worker.go | 65 +++ internal/services/retry_scheduler_worker.go | 42 ++ 7 files changed, 935 insertions(+), 80 deletions(-) create mode 100644 internal/services/builder.go create mode 100644 internal/services/deliverymq_worker.go create mode 100644 internal/services/http_worker.go create mode 100644 internal/services/logmq_worker.go create mode 100644 internal/services/publishmq_worker.go create mode 100644 internal/services/retry_scheduler_worker.go diff --git a/internal/app/app.go b/internal/app/app.go index 5879f20b..09f1f34c 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -6,7 +6,6 @@ import ( "os" "os/signal" "strings" - "sync" "syscall" "time" @@ -17,9 +16,7 @@ import ( "github.com/hookdeck/outpost/internal/migrator" "github.com/hookdeck/outpost/internal/otel" "github.com/hookdeck/outpost/internal/redis" - "github.com/hookdeck/outpost/internal/services/api" - "github.com/hookdeck/outpost/internal/services/delivery" - "github.com/hookdeck/outpost/internal/services/log" + "github.com/hookdeck/outpost/internal/services" "github.com/hookdeck/outpost/internal/telemetry" "go.uber.org/zap" ) @@ -106,14 +103,14 @@ func run(mainContext context.Context, cfg *config.Config) error { telemetry.Init(mainContext) telemetry.ApplicationStarted(mainContext, cfg.ToTelemetryApplicationInfo()) - // Set up cancellation context and waitgroup + // Set up cancellation context ctx, cancel := context.WithCancel(mainContext) + defer cancel() // Set up OpenTelemetry. if cfg.OpenTelemetry.ToConfig() != nil { otelShutdown, err := otel.SetupOTelSDK(ctx, cfg.OpenTelemetry.ToConfig()) if err != nil { - cancel() return err } // Handle shutdown properly so nothing leaks. @@ -122,100 +119,55 @@ func run(mainContext context.Context, cfg *config.Config) error { }() } - // Initialize waitgroup - // Once all services are done, we can exit. - // Each service will wait for the context to be cancelled before shutting down. 
- wg := &sync.WaitGroup{} - - // Construct services based on config - logger.Debug("constructing services") - services, err := constructServices( - ctx, - cfg, - wg, - logger, - telemetry, - ) - if err != nil { - logger.Error("service construction failed", zap.Error(err)) - cancel() + // Build services using ServiceBuilder + logger.Debug("building services") + builder := services.NewServiceBuilder(ctx, cfg, logger, telemetry) + + if err := builder.BuildWorkers(); err != nil { + logger.Error("failed to build workers", zap.Error(err)) return err } - // Start services - logger.Info("starting services", zap.Int("count", len(services))) - for _, service := range services { - go service.Run(ctx) + supervisor, err := builder.Build() + if err != nil { + logger.Error("failed to build worker supervisor", zap.Error(err)) + return err } // Handle sigterm and await termChan signal termChan := make(chan os.Signal, 1) signal.Notify(termChan, syscall.SIGINT, syscall.SIGTERM) - // Wait for either context cancellation or termination signal + // Run workers in goroutine + errChan := make(chan error, 1) + go func() { + errChan <- supervisor.Run(ctx) + }() + + // Wait for either termination signal or worker failure select { case <-termChan: - logger.Ctx(ctx).Info("shutdown signal received") - case <-ctx.Done(): - logger.Ctx(ctx).Info("context cancelled") + logger.Info("shutdown signal received") + cancel() // Cancel context to trigger graceful shutdown + <-errChan // Wait for workers to finish + case err := <-errChan: + if err != nil { + logger.Error("worker error", zap.Error(err)) + } } telemetry.Flush() - // Handle shutdown - cancel() // Signal cancellation to context.Context - wg.Wait() // Block here until all workers are done + // Run cleanup functions + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer shutdownCancel() + builder.Cleanup(shutdownCtx) - logger.Ctx(ctx).Info("outpost shutdown complete") + logger.Info("outpost shutdown complete") return nil } -type Service interface { - Run(ctx context.Context) error -} - -func constructServices( - ctx context.Context, - cfg *config.Config, - wg *sync.WaitGroup, - logger *logging.Logger, - telemetry telemetry.Telemetry, -) ([]Service, error) { - serviceType := cfg.MustGetService() - services := []Service{} - - if serviceType == config.ServiceTypeAPI || serviceType == config.ServiceTypeAll { - logger.Debug("creating API service") - service, err := api.NewService(ctx, wg, cfg, logger, telemetry) - if err != nil { - logger.Error("API service creation failed", zap.Error(err)) - return nil, err - } - services = append(services, service) - } - if serviceType == config.ServiceTypeDelivery || serviceType == config.ServiceTypeAll { - logger.Debug("creating delivery service") - service, err := delivery.NewService(ctx, wg, cfg, logger, nil) - if err != nil { - logger.Error("delivery service creation failed", zap.Error(err)) - return nil, err - } - services = append(services, service) - } - if serviceType == config.ServiceTypeLog || serviceType == config.ServiceTypeAll { - logger.Debug("creating log service") - service, err := log.NewService(ctx, wg, cfg, logger, nil) - if err != nil { - logger.Error("log service creation failed", zap.Error(err)) - return nil, err - } - services = append(services, service) - } - - return services, nil -} - // runMigration handles database schema migrations with retry logic for lock conflicts. 
// // MIGRATION LOCK BEHAVIOR: diff --git a/internal/services/builder.go b/internal/services/builder.go new file mode 100644 index 00000000..33cc53f0 --- /dev/null +++ b/internal/services/builder.go @@ -0,0 +1,604 @@ +package services + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/hookdeck/outpost/internal/alert" + apirouter "github.com/hookdeck/outpost/internal/apirouter" + "github.com/hookdeck/outpost/internal/config" + "github.com/hookdeck/outpost/internal/deliverymq" + "github.com/hookdeck/outpost/internal/destregistry" + destregistrydefault "github.com/hookdeck/outpost/internal/destregistry/providers" + "github.com/hookdeck/outpost/internal/eventtracer" + "github.com/hookdeck/outpost/internal/idempotence" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/logmq" + "github.com/hookdeck/outpost/internal/logstore" + "github.com/hookdeck/outpost/internal/models" + "github.com/hookdeck/outpost/internal/mqs" + "github.com/hookdeck/outpost/internal/publishmq" + "github.com/hookdeck/outpost/internal/redis" + "github.com/hookdeck/outpost/internal/telemetry" + "github.com/hookdeck/outpost/internal/worker" + "github.com/mikestefanello/batcher" + "go.uber.org/zap" +) + +// ServiceBuilder constructs workers based on service configuration. +type ServiceBuilder struct { + ctx context.Context + cfg *config.Config + logger *logging.Logger + telemetry telemetry.Telemetry + supervisor *worker.WorkerSupervisor + + // Track service instances for cleanup + services []*serviceInstance +} + +// serviceInstance represents a single service with its cleanup functions +type serviceInstance struct { + name string + cleanupFuncs []func(context.Context, *logging.LoggerWithCtx) +} + +// NewServiceBuilder creates a new ServiceBuilder. +func NewServiceBuilder(ctx context.Context, cfg *config.Config, logger *logging.Logger, telemetry telemetry.Telemetry) *ServiceBuilder { + return &ServiceBuilder{ + ctx: ctx, + cfg: cfg, + logger: logger, + telemetry: telemetry, + supervisor: worker.NewWorkerSupervisor(logger), + services: []*serviceInstance{}, + } +} + +// BuildAPIWorkers creates and registers all workers for the API service. +// This sets up the infrastructure and creates 3 workers: +// 1. HTTP server +// 2. Retry scheduler +// 3. 
PublishMQ consumer (optional) +func (b *ServiceBuilder) BuildAPIWorkers() error { + b.logger.Debug("building API service workers") + + // Create a new service instance for API + svc := &serviceInstance{ + name: "api", + cleanupFuncs: []func(context.Context, *logging.LoggerWithCtx){}, + } + b.services = append(b.services, svc) + + // Initialize destination registry + b.logger.Debug("initializing destination registry") + registry := destregistry.NewRegistry(&destregistry.Config{ + DestinationMetadataPath: b.cfg.Destinations.MetadataPath, + DeliveryTimeout: time.Duration(b.cfg.DeliveryTimeoutSeconds) * time.Second, + }, b.logger) + if err := destregistrydefault.RegisterDefault(registry, b.cfg.Destinations.ToConfig(b.cfg)); err != nil { + b.logger.Error("destination registry setup failed", zap.Error(err)) + return err + } + + // Initialize delivery MQ + b.logger.Debug("configuring delivery message queue") + deliveryQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "deliverymq") + if err != nil { + b.logger.Error("delivery queue configuration failed", zap.Error(err)) + return err + } + + b.logger.Debug("initializing delivery MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) + deliveryMQ := deliverymq.New(deliverymq.WithQueue(deliveryQueueConfig)) + cleanupDeliveryMQ, err := deliveryMQ.Init(b.ctx) + if err != nil { + b.logger.Error("delivery MQ initialization failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupDeliveryMQ() }) + + // Initialize Redis + b.logger.Debug("initializing Redis client for API service") + redisClient, err := redis.New(b.ctx, b.cfg.Redis.ToConfig()) + if err != nil { + b.logger.Error("Redis client initialization failed in API service", zap.Error(err)) + return err + } + + // Initialize log store + b.logger.Debug("configuring log store driver") + logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ + Postgres: &b.cfg.PostgresURL, + }) + if err != nil { + b.logger.Error("log store driver configuration failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + logStoreDriverOpts.Close() + }) + + b.logger.Debug("creating log store") + logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) + if err != nil { + b.logger.Error("log store creation failed", zap.Error(err)) + return err + } + + // Initialize event tracer + b.logger.Debug("setting up event tracer") + var eventTracer eventtracer.EventTracer + if b.cfg.OpenTelemetry.ToConfig() == nil { + eventTracer = eventtracer.NewNoopEventTracer() + } else { + eventTracer = eventtracer.NewEventTracer() + } + + // Initialize entity store + b.logger.Debug("creating entity store with Redis client") + entityStore := models.NewEntityStore(redisClient, + models.WithCipher(models.NewAESCipher(b.cfg.AESEncryptionSecret)), + models.WithAvailableTopics(b.cfg.Topics), + models.WithMaxDestinationsPerTenant(b.cfg.MaxDestinationsPerTenant), + models.WithDeploymentID(b.cfg.DeploymentID), + ) + + // Initialize event handler and router + b.logger.Debug("creating event handler and router") + publishIdempotence := idempotence.New(redisClient, + idempotence.WithTimeout(5*time.Second), + idempotence.WithSuccessfulTTL(time.Duration(b.cfg.PublishIdempotencyKeyTTL)*time.Second), + ) + eventHandler := publishmq.NewEventHandler(b.logger, deliveryMQ, entityStore, eventTracer, b.cfg.Topics, publishIdempotence) + router := 
apirouter.NewRouter( + apirouter.RouterConfig{ + ServiceName: b.cfg.OpenTelemetry.GetServiceName(), + APIKey: b.cfg.APIKey, + JWTSecret: b.cfg.APIJWTSecret, + Topics: b.cfg.Topics, + Registry: registry, + PortalConfig: b.cfg.GetPortalConfig(), + GinMode: b.cfg.GinMode, + }, + b.logger, + redisClient, + deliveryMQ, + entityStore, + logStore, + eventHandler, + b.telemetry, + ) + + // Initialize retry scheduler + b.logger.Debug("creating delivery MQ retry scheduler") + deliverymqRetryScheduler, err := deliverymq.NewRetryScheduler(deliveryMQ, b.cfg.Redis.ToConfig(), b.cfg.DeploymentID, b.logger) + if err != nil { + b.logger.Error("failed to create delivery MQ retry scheduler", zap.Error(err)) + return err + } + b.logger.Debug("initializing delivery MQ retry scheduler") + if err := deliverymqRetryScheduler.Init(b.ctx); err != nil { + b.logger.Error("delivery MQ retry scheduler initialization failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + deliverymqRetryScheduler.Shutdown() + }) + + // Create HTTP server + b.logger.Debug("creating HTTP server") + httpServer := &http.Server{ + Addr: fmt.Sprintf(":%d", b.cfg.APIPort), + Handler: router, + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + if err := httpServer.Shutdown(ctx); err != nil { + logger.Error("error shutting down http server", zap.Error(err)) + } + logger.Info("http server shut down") + }) + + // Worker 1: HTTP Server + httpWorker := NewHTTPServerWorker(httpServer, b.logger) + b.supervisor.Register(httpWorker) + + // Worker 2: Retry Scheduler + retryWorker := NewRetrySchedulerWorker(deliverymqRetryScheduler, b.logger) + b.supervisor.Register(retryWorker) + + // Worker 3: PublishMQ Consumer (optional) + if b.cfg.PublishMQ.GetQueueConfig() != nil { + publishMQ := publishmq.New(publishmq.WithQueue(b.cfg.PublishMQ.GetQueueConfig())) + publishMQWorker := NewPublishMQWorker( + publishMQ, + eventHandler, + b.cfg.PublishMaxConcurrency, + b.logger, + ) + b.supervisor.Register(publishMQWorker) + } + + b.logger.Info("API service workers built successfully") + return nil +} + +// BuildDeliveryWorker creates and registers the delivery worker. 
+func (b *ServiceBuilder) BuildDeliveryWorker() error { + b.logger.Debug("building delivery service worker") + + // Create a new service instance for Delivery + svc := &serviceInstance{ + name: "delivery", + cleanupFuncs: []func(context.Context, *logging.LoggerWithCtx){}, + } + b.services = append(b.services, svc) + + // Initialize Redis + b.logger.Debug("initializing Redis client for delivery service") + redisClient, err := redis.New(b.ctx, b.cfg.Redis.ToConfig()) + if err != nil { + b.logger.Error("Redis client initialization failed in delivery service", zap.Error(err)) + return err + } + + // Initialize LogMQ + b.logger.Debug("configuring log message queue") + logQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "logmq") + if err != nil { + b.logger.Error("log queue configuration failed", zap.Error(err)) + return err + } + + b.logger.Debug("initializing log MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) + logMQ := logmq.New(logmq.WithQueue(logQueueConfig)) + cleanupLogMQ, err := logMQ.Init(b.ctx) + if err != nil { + b.logger.Error("log MQ initialization failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupLogMQ() }) + + // Initialize DeliveryMQ + b.logger.Debug("configuring delivery message queue") + deliveryQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "deliverymq") + if err != nil { + b.logger.Error("delivery queue configuration failed", zap.Error(err)) + return err + } + + b.logger.Debug("initializing delivery MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) + deliveryMQ := deliverymq.New(deliverymq.WithQueue(deliveryQueueConfig)) + cleanupDeliveryMQ, err := deliveryMQ.Init(b.ctx) + if err != nil { + b.logger.Error("delivery MQ initialization failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupDeliveryMQ() }) + + // Initialize destination registry + b.logger.Debug("initializing destination registry") + registry := destregistry.NewRegistry(&destregistry.Config{ + DestinationMetadataPath: b.cfg.Destinations.MetadataPath, + DeliveryTimeout: time.Duration(b.cfg.DeliveryTimeoutSeconds) * time.Second, + }, b.logger) + if err := destregistrydefault.RegisterDefault(registry, b.cfg.Destinations.ToConfig(b.cfg)); err != nil { + b.logger.Error("destination registry setup failed", zap.Error(err)) + return err + } + + // Initialize event tracer + b.logger.Debug("setting up event tracer") + var eventTracer eventtracer.EventTracer + if b.cfg.OpenTelemetry.ToConfig() == nil { + eventTracer = eventtracer.NewNoopEventTracer() + } else { + eventTracer = eventtracer.NewEventTracer() + } + + // Initialize entity store + b.logger.Debug("creating entity store with Redis client") + entityStore := models.NewEntityStore(redisClient, + models.WithCipher(models.NewAESCipher(b.cfg.AESEncryptionSecret)), + models.WithAvailableTopics(b.cfg.Topics), + models.WithMaxDestinationsPerTenant(b.cfg.MaxDestinationsPerTenant), + models.WithDeploymentID(b.cfg.DeploymentID), + ) + + // Initialize log store + b.logger.Debug("configuring log store driver") + logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ + Postgres: &b.cfg.PostgresURL, + }) + if err != nil { + b.logger.Error("log store driver configuration failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + 
logStoreDriverOpts.Close() + }) + + b.logger.Debug("creating log store") + logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) + if err != nil { + b.logger.Error("log store creation failed", zap.Error(err)) + return err + } + + // Initialize retry scheduler + b.logger.Debug("creating delivery MQ retry scheduler") + retryScheduler, err := deliverymq.NewRetryScheduler(deliveryMQ, b.cfg.Redis.ToConfig(), b.cfg.DeploymentID, b.logger) + if err != nil { + b.logger.Error("failed to create delivery MQ retry scheduler", zap.Error(err)) + return err + } + b.logger.Debug("initializing delivery MQ retry scheduler") + if err := retryScheduler.Init(b.ctx); err != nil { + b.logger.Error("delivery MQ retry scheduler initialization failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + retryScheduler.Shutdown() + }) + + // Initialize alert monitor + var alertNotifier alert.AlertNotifier + var destinationDisabler alert.DestinationDisabler + if b.cfg.Alert.CallbackURL != "" { + alertNotifier = alert.NewHTTPAlertNotifier(b.cfg.Alert.CallbackURL, alert.NotifierWithBearerToken(b.cfg.APIKey)) + } + if b.cfg.Alert.AutoDisableDestination { + destinationDisabler = newDestinationDisabler(entityStore) + } + alertMonitor := alert.NewAlertMonitor( + b.logger, + redisClient, + alert.WithNotifier(alertNotifier), + alert.WithDisabler(destinationDisabler), + alert.WithAutoDisableFailureCount(b.cfg.Alert.ConsecutiveFailureCount), + alert.WithDeploymentID(b.cfg.DeploymentID), + ) + + // Initialize delivery idempotence + deliveryIdempotence := idempotence.New(redisClient, + idempotence.WithTimeout(5*time.Second), + idempotence.WithSuccessfulTTL(time.Duration(b.cfg.DeliveryIdempotencyKeyTTL)*time.Second), + ) + + // Get retry configuration + retryBackoff, retryMaxLimit := b.cfg.GetRetryBackoff() + + // Create delivery handler + handler := deliverymq.NewMessageHandler( + b.logger, + logMQ, + entityStore, + logStore, + registry, + eventTracer, + retryScheduler, + retryBackoff, + retryMaxLimit, + alertMonitor, + deliveryIdempotence, + ) + + // Create DeliveryMQ worker + deliveryWorker := NewDeliveryMQWorker( + deliveryMQ, + handler, + b.cfg.DeliveryMaxConcurrency, + b.logger, + ) + b.supervisor.Register(deliveryWorker) + + b.logger.Info("delivery service worker built successfully") + return nil +} + +// BuildLogWorker creates and registers the log worker. 
+func (b *ServiceBuilder) BuildLogWorker() error { + b.logger.Debug("building log service worker") + + // Create a new service instance for Log + svc := &serviceInstance{ + name: "log", + cleanupFuncs: []func(context.Context, *logging.LoggerWithCtx){}, + } + b.services = append(b.services, svc) + + // Initialize log store + b.logger.Debug("configuring log store driver") + logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ + Postgres: &b.cfg.PostgresURL, + }) + if err != nil { + b.logger.Error("log store driver configuration failed", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + logStoreDriverOpts.Close() + }) + + b.logger.Debug("creating log store") + logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) + if err != nil { + b.logger.Error("log store creation failed", zap.Error(err)) + return err + } + + // Create batcher for batching log writes + batcherCfg := struct { + ItemCountThreshold int + DelayThreshold time.Duration + }{ + ItemCountThreshold: b.cfg.LogBatchSize, + DelayThreshold: time.Duration(b.cfg.LogBatchThresholdSeconds) * time.Second, + } + + b.logger.Debug("creating log batcher") + batcher, err := b.makeBatcher(logStore, batcherCfg.ItemCountThreshold, batcherCfg.DelayThreshold) + if err != nil { + b.logger.Error("failed to create batcher", zap.Error(err)) + return err + } + svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + batcher.Shutdown() + }) + + // Create log handler with batcher + handler := logmq.NewMessageHandler(b.logger, &handlerBatcherImpl{batcher: batcher}) + + // Initialize LogMQ + b.logger.Debug("configuring log message queue") + logQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "logmq") + if err != nil { + b.logger.Error("log queue configuration failed", zap.Error(err)) + return err + } + + logMQ := logmq.New(logmq.WithQueue(logQueueConfig)) + + // Create LogMQ worker + logWorker := NewLogMQWorker( + logMQ, + handler, + b.cfg.DeliveryMaxConcurrency, + b.logger, + ) + b.supervisor.Register(logWorker) + + b.logger.Info("log service worker built successfully") + return nil +} + +// BuildWorkers builds workers based on the configured service type. +func (b *ServiceBuilder) BuildWorkers() error { + serviceType := b.cfg.MustGetService() + b.logger.Debug("building workers for service type", zap.String("service_type", serviceType.String())) + + if serviceType == config.ServiceTypeAPI || serviceType == config.ServiceTypeAll { + if err := b.BuildAPIWorkers(); err != nil { + b.logger.Error("failed to build API workers", zap.Error(err)) + return err + } + } + if serviceType == config.ServiceTypeDelivery || serviceType == config.ServiceTypeAll { + if err := b.BuildDeliveryWorker(); err != nil { + b.logger.Error("failed to build delivery worker", zap.Error(err)) + return err + } + } + if serviceType == config.ServiceTypeLog || serviceType == config.ServiceTypeAll { + if err := b.BuildLogWorker(); err != nil { + b.logger.Error("failed to build log worker", zap.Error(err)) + return err + } + } + + return nil +} + +// Build returns the configured WorkerSupervisor. +func (b *ServiceBuilder) Build() (*worker.WorkerSupervisor, error) { + return b.supervisor, nil +} + +// Cleanup runs all registered cleanup functions for all services. 
+func (b *ServiceBuilder) Cleanup(ctx context.Context) { + logger := b.logger.Ctx(ctx) + for _, svc := range b.services { + logger.Debug("cleaning up service", zap.String("service", svc.name)) + for _, cleanupFunc := range svc.cleanupFuncs { + cleanupFunc(ctx, &logger) + } + } +} + +// destinationDisabler implements alert.DestinationDisabler +type destinationDisabler struct { + entityStore models.EntityStore +} + +func newDestinationDisabler(entityStore models.EntityStore) alert.DestinationDisabler { + return &destinationDisabler{ + entityStore: entityStore, + } +} + +func (d *destinationDisabler) DisableDestination(ctx context.Context, tenantID, destinationID string) error { + destination, err := d.entityStore.RetrieveDestination(ctx, tenantID, destinationID) + if err != nil { + return err + } + if destination == nil { + return nil + } + now := time.Now() + destination.DisabledAt = &now + return d.entityStore.UpsertDestination(ctx, *destination) +} + +// makeBatcher creates a batcher for batching log writes +func (b *ServiceBuilder) makeBatcher(logStore logstore.LogStore, itemCountThreshold int, delayThreshold time.Duration) (*batcher.Batcher[*mqs.Message], error) { + batchr, err := batcher.NewBatcher(batcher.Config[*mqs.Message]{ + GroupCountThreshold: 2, + ItemCountThreshold: itemCountThreshold, + DelayThreshold: delayThreshold, + NumGoroutines: 1, + Processor: func(_ string, msgs []*mqs.Message) { + logger := b.logger.Ctx(b.ctx) + logger.Info("processing batch", zap.Int("message_count", len(msgs))) + + nackAll := func() { + for _, msg := range msgs { + msg.Nack() + } + } + + deliveryEvents := make([]*models.DeliveryEvent, 0, len(msgs)) + for _, msg := range msgs { + deliveryEvent := models.DeliveryEvent{} + if err := deliveryEvent.FromMessage(msg); err != nil { + logger.Error("failed to parse delivery event", + zap.Error(err), + zap.String("message_id", msg.LoggableID)) + nackAll() + return + } + deliveryEvents = append(deliveryEvents, &deliveryEvent) + } + + if err := logStore.InsertManyDeliveryEvent(b.ctx, deliveryEvents); err != nil { + logger.Error("failed to insert delivery events", + zap.Error(err), + zap.Int("count", len(deliveryEvents))) + nackAll() + return + } + + logger.Info("batch processed successfully", zap.Int("count", len(msgs))) + + for _, msg := range msgs { + msg.Ack() + } + }, + }) + if err != nil { + b.logger.Ctx(b.ctx).Error("failed to create batcher", zap.Error(err)) + return nil, err + } + return batchr, nil +} + +// handlerBatcherImpl implements the batcher interface expected by logmq.MessageHandler +type handlerBatcherImpl struct { + batcher *batcher.Batcher[*mqs.Message] +} + +func (hb *handlerBatcherImpl) Add(ctx context.Context, msg *mqs.Message) error { + hb.batcher.Add("", msg) + return nil +} diff --git a/internal/services/deliverymq_worker.go b/internal/services/deliverymq_worker.go new file mode 100644 index 00000000..a0684b25 --- /dev/null +++ b/internal/services/deliverymq_worker.go @@ -0,0 +1,64 @@ +package services + +import ( + "context" + "errors" + + "github.com/hookdeck/outpost/internal/consumer" + "github.com/hookdeck/outpost/internal/deliverymq" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// DeliveryMQWorker wraps a DeliveryMQ consumer as a worker. +type DeliveryMQWorker struct { + deliveryMQ *deliverymq.DeliveryMQ + handler consumer.MessageHandler + concurrency int + logger *logging.Logger +} + +// NewDeliveryMQWorker creates a new DeliveryMQ consumer worker. 
+func NewDeliveryMQWorker( + deliveryMQ *deliverymq.DeliveryMQ, + handler consumer.MessageHandler, + concurrency int, + logger *logging.Logger, +) worker.Worker { + return &DeliveryMQWorker{ + deliveryMQ: deliveryMQ, + handler: handler, + concurrency: concurrency, + logger: logger, + } +} + +// Name returns the worker name. +func (w *DeliveryMQWorker) Name() string { + return "deliverymq-consumer" +} + +// Run starts the DeliveryMQ consumer and blocks until context is cancelled or it fails. +func (w *DeliveryMQWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("deliverymq consumer running") + + subscription, err := w.deliveryMQ.Subscribe(ctx) + if err != nil { + logger.Error("error subscribing to deliverymq", zap.Error(err)) + return err + } + + csm := consumer.New(subscription, w.handler, + consumer.WithName("deliverymq"), + consumer.WithConcurrency(w.concurrency), + ) + + if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { + logger.Error("error running deliverymq consumer", zap.Error(err)) + return err + } + + return nil +} diff --git a/internal/services/http_worker.go b/internal/services/http_worker.go new file mode 100644 index 00000000..b34297d6 --- /dev/null +++ b/internal/services/http_worker.go @@ -0,0 +1,64 @@ +package services + +import ( + "context" + "net/http" + "time" + + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// HTTPServerWorker wraps an HTTP server as a worker. +type HTTPServerWorker struct { + server *http.Server + logger *logging.Logger +} + +// NewHTTPServerWorker creates a new HTTP server worker. +func NewHTTPServerWorker(server *http.Server, logger *logging.Logger) worker.Worker { + return &HTTPServerWorker{ + server: server, + logger: logger, + } +} + +// Name returns the worker name. +func (w *HTTPServerWorker) Name() string { + return "http-server" +} + +// Run starts the HTTP server and blocks until context is cancelled or server fails. +func (w *HTTPServerWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("http server listening", zap.String("addr", w.server.Addr)) + + // Start server in goroutine + errChan := make(chan error, 1) + go func() { + if err := w.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + errChan <- err + } + }() + + // Wait for context cancellation or server error + select { + case <-ctx.Done(): + // Graceful shutdown + logger.Info("shutting down http server") + shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := w.server.Shutdown(shutdownCtx); err != nil { + logger.Error("error shutting down http server", zap.Error(err)) + return err + } + logger.Info("http server shut down") + return nil + + case err := <-errChan: + logger.Error("http server error", zap.Error(err)) + return err + } +} diff --git a/internal/services/logmq_worker.go b/internal/services/logmq_worker.go new file mode 100644 index 00000000..871b4f38 --- /dev/null +++ b/internal/services/logmq_worker.go @@ -0,0 +1,64 @@ +package services + +import ( + "context" + "errors" + + "github.com/hookdeck/outpost/internal/consumer" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/logmq" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// LogMQWorker wraps a LogMQ consumer as a worker. 
+type LogMQWorker struct { + logMQ *logmq.LogMQ + handler consumer.MessageHandler + concurrency int + logger *logging.Logger +} + +// NewLogMQWorker creates a new LogMQ consumer worker. +func NewLogMQWorker( + logMQ *logmq.LogMQ, + handler consumer.MessageHandler, + concurrency int, + logger *logging.Logger, +) worker.Worker { + return &LogMQWorker{ + logMQ: logMQ, + handler: handler, + concurrency: concurrency, + logger: logger, + } +} + +// Name returns the worker name. +func (w *LogMQWorker) Name() string { + return "logmq-consumer" +} + +// Run starts the LogMQ consumer and blocks until context is cancelled or it fails. +func (w *LogMQWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("logmq consumer running") + + subscription, err := w.logMQ.Subscribe(ctx) + if err != nil { + logger.Error("error subscribing to logmq", zap.Error(err)) + return err + } + + csm := consumer.New(subscription, w.handler, + consumer.WithName("logmq"), + consumer.WithConcurrency(w.concurrency), + ) + + if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { + logger.Error("error running logmq consumer", zap.Error(err)) + return err + } + + return nil +} diff --git a/internal/services/publishmq_worker.go b/internal/services/publishmq_worker.go new file mode 100644 index 00000000..0b77b30b --- /dev/null +++ b/internal/services/publishmq_worker.go @@ -0,0 +1,65 @@ +package services + +import ( + "context" + "errors" + + "github.com/hookdeck/outpost/internal/consumer" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/publishmq" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// PublishMQWorker wraps a PublishMQ consumer as a worker. +type PublishMQWorker struct { + publishMQ *publishmq.PublishMQ + eventHandler publishmq.EventHandler + concurrency int + logger *logging.Logger +} + +// NewPublishMQWorker creates a new PublishMQ consumer worker. +func NewPublishMQWorker( + publishMQ *publishmq.PublishMQ, + eventHandler publishmq.EventHandler, + concurrency int, + logger *logging.Logger, +) worker.Worker { + return &PublishMQWorker{ + publishMQ: publishMQ, + eventHandler: eventHandler, + concurrency: concurrency, + logger: logger, + } +} + +// Name returns the worker name. +func (w *PublishMQWorker) Name() string { + return "publishmq-consumer" +} + +// Run starts the PublishMQ consumer and blocks until context is cancelled or it fails. 
+func (w *PublishMQWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("publishmq consumer running") + + subscription, err := w.publishMQ.Subscribe(ctx) + if err != nil { + logger.Error("error subscribing to publishmq", zap.Error(err)) + return err + } + + messageHandler := publishmq.NewMessageHandler(w.eventHandler) + csm := consumer.New(subscription, messageHandler, + consumer.WithName("publishmq"), + consumer.WithConcurrency(w.concurrency), + ) + + if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { + logger.Error("error running publishmq consumer", zap.Error(err)) + return err + } + + return nil +} diff --git a/internal/services/retry_scheduler_worker.go b/internal/services/retry_scheduler_worker.go new file mode 100644 index 00000000..ac1515e4 --- /dev/null +++ b/internal/services/retry_scheduler_worker.go @@ -0,0 +1,42 @@ +package services + +import ( + "context" + + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/scheduler" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// RetrySchedulerWorker wraps a retry scheduler as a worker. +type RetrySchedulerWorker struct { + scheduler scheduler.Scheduler + logger *logging.Logger +} + +// NewRetrySchedulerWorker creates a new retry scheduler worker. +func NewRetrySchedulerWorker(scheduler scheduler.Scheduler, logger *logging.Logger) worker.Worker { + return &RetrySchedulerWorker{ + scheduler: scheduler, + logger: logger, + } +} + +// Name returns the worker name. +func (w *RetrySchedulerWorker) Name() string { + return "retry-scheduler" +} + +// Run starts the retry scheduler monitor and blocks until context is cancelled or it fails. +func (w *RetrySchedulerWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("retry scheduler monitor running") + + if err := w.scheduler.Monitor(ctx); err != nil { + logger.Error("retry scheduler monitor error", zap.Error(err)) + return err + } + + return nil +} From 47c2335700b814c9e3a60559b4268f6703718968 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 01:04:53 +0700 Subject: [PATCH 05/19] chore: refactor common deps --- internal/app/app.go | 9 +- internal/services/builder.go | 471 +++++++++++++++++------------------ 2 files changed, 228 insertions(+), 252 deletions(-) diff --git a/internal/app/app.go b/internal/app/app.go index 09f1f34c..d2f11e81 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -123,14 +123,9 @@ func run(mainContext context.Context, cfg *config.Config) error { logger.Debug("building services") builder := services.NewServiceBuilder(ctx, cfg, logger, telemetry) - if err := builder.BuildWorkers(); err != nil { - logger.Error("failed to build workers", zap.Error(err)) - return err - } - - supervisor, err := builder.Build() + supervisor, err := builder.BuildWorkers() if err != nil { - logger.Error("failed to build worker supervisor", zap.Error(err)) + logger.Error("failed to build workers", zap.Error(err)) return err } diff --git a/internal/services/builder.go b/internal/services/builder.go index 33cc53f0..31590ef4 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -21,6 +21,7 @@ import ( "github.com/hookdeck/outpost/internal/mqs" "github.com/hookdeck/outpost/internal/publishmq" "github.com/hookdeck/outpost/internal/redis" + "github.com/hookdeck/outpost/internal/scheduler" "github.com/hookdeck/outpost/internal/telemetry" "github.com/hookdeck/outpost/internal/worker" "github.com/mikestefanello/batcher" @@ 
-39,10 +40,20 @@ type ServiceBuilder struct { services []*serviceInstance } -// serviceInstance represents a single service with its cleanup functions +// serviceInstance represents a single service with its cleanup functions and common dependencies type serviceInstance struct { name string cleanupFuncs []func(context.Context, *logging.LoggerWithCtx) + + // Common infrastructure + redisClient redis.Client + logStore logstore.LogStore + entityStore models.EntityStore + destRegistry destregistry.Registry + eventTracer eventtracer.EventTracer + deliveryMQ *deliverymq.DeliveryMQ + logMQ *logmq.LogMQ + retryScheduler scheduler.Scheduler } // NewServiceBuilder creates a new ServiceBuilder. @@ -57,6 +68,44 @@ func NewServiceBuilder(ctx context.Context, cfg *config.Config, logger *logging. } } +// BuildWorkers builds workers based on the configured service type and returns the supervisor. +func (b *ServiceBuilder) BuildWorkers() (*worker.WorkerSupervisor, error) { + serviceType := b.cfg.MustGetService() + b.logger.Debug("building workers for service type", zap.String("service_type", serviceType.String())) + + if serviceType == config.ServiceTypeAPI || serviceType == config.ServiceTypeAll { + if err := b.BuildAPIWorkers(); err != nil { + b.logger.Error("failed to build API workers", zap.Error(err)) + return nil, err + } + } + if serviceType == config.ServiceTypeDelivery || serviceType == config.ServiceTypeAll { + if err := b.BuildDeliveryWorker(); err != nil { + b.logger.Error("failed to build delivery worker", zap.Error(err)) + return nil, err + } + } + if serviceType == config.ServiceTypeLog || serviceType == config.ServiceTypeAll { + if err := b.BuildLogWorker(); err != nil { + b.logger.Error("failed to build log worker", zap.Error(err)) + return nil, err + } + } + + return b.supervisor, nil +} + +// Cleanup runs all registered cleanup functions for all services. +func (b *ServiceBuilder) Cleanup(ctx context.Context) { + logger := b.logger.Ctx(ctx) + for _, svc := range b.services { + logger.Debug("cleaning up service", zap.String("service", svc.name)) + for _, cleanupFunc := range svc.cleanupFuncs { + cleanupFunc(ctx, &logger) + } + } +} + // BuildAPIWorkers creates and registers all workers for the API service. // This sets up the infrastructure and creates 3 workers: // 1. 
HTTP server @@ -72,122 +121,57 @@ func (b *ServiceBuilder) BuildAPIWorkers() error { } b.services = append(b.services, svc) - // Initialize destination registry - b.logger.Debug("initializing destination registry") - registry := destregistry.NewRegistry(&destregistry.Config{ - DestinationMetadataPath: b.cfg.Destinations.MetadataPath, - DeliveryTimeout: time.Duration(b.cfg.DeliveryTimeoutSeconds) * time.Second, - }, b.logger) - if err := destregistrydefault.RegisterDefault(registry, b.cfg.Destinations.ToConfig(b.cfg)); err != nil { - b.logger.Error("destination registry setup failed", zap.Error(err)) + // Initialize common infrastructure + if err := svc.initDestRegistry(b.cfg, b.logger); err != nil { return err } - - // Initialize delivery MQ - b.logger.Debug("configuring delivery message queue") - deliveryQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "deliverymq") - if err != nil { - b.logger.Error("delivery queue configuration failed", zap.Error(err)) + if err := svc.initDeliveryMQ(b.ctx, b.cfg, b.logger); err != nil { return err } - - b.logger.Debug("initializing delivery MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) - deliveryMQ := deliverymq.New(deliverymq.WithQueue(deliveryQueueConfig)) - cleanupDeliveryMQ, err := deliveryMQ.Init(b.ctx) - if err != nil { - b.logger.Error("delivery MQ initialization failed", zap.Error(err)) + if err := svc.initRedis(b.ctx, b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupDeliveryMQ() }) - - // Initialize Redis - b.logger.Debug("initializing Redis client for API service") - redisClient, err := redis.New(b.ctx, b.cfg.Redis.ToConfig()) - if err != nil { - b.logger.Error("Redis client initialization failed in API service", zap.Error(err)) + if err := svc.initLogStore(b.ctx, b.cfg, b.logger); err != nil { return err } - - // Initialize log store - b.logger.Debug("configuring log store driver") - logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ - Postgres: &b.cfg.PostgresURL, - }) - if err != nil { - b.logger.Error("log store driver configuration failed", zap.Error(err)) + if err := svc.initEventTracer(b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - logStoreDriverOpts.Close() - }) - - b.logger.Debug("creating log store") - logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) - if err != nil { - b.logger.Error("log store creation failed", zap.Error(err)) + if err := svc.initEntityStore(b.cfg, b.logger); err != nil { return err } - // Initialize event tracer - b.logger.Debug("setting up event tracer") - var eventTracer eventtracer.EventTracer - if b.cfg.OpenTelemetry.ToConfig() == nil { - eventTracer = eventtracer.NewNoopEventTracer() - } else { - eventTracer = eventtracer.NewEventTracer() + // Initialize retry scheduler + if err := svc.initRetryScheduler(b.ctx, b.cfg, b.logger); err != nil { + return err } - // Initialize entity store - b.logger.Debug("creating entity store with Redis client") - entityStore := models.NewEntityStore(redisClient, - models.WithCipher(models.NewAESCipher(b.cfg.AESEncryptionSecret)), - models.WithAvailableTopics(b.cfg.Topics), - models.WithMaxDestinationsPerTenant(b.cfg.MaxDestinationsPerTenant), - models.WithDeploymentID(b.cfg.DeploymentID), - ) - // Initialize event handler and router b.logger.Debug("creating event handler and router") - publishIdempotence := 
idempotence.New(redisClient, + publishIdempotence := idempotence.New(svc.redisClient, idempotence.WithTimeout(5*time.Second), idempotence.WithSuccessfulTTL(time.Duration(b.cfg.PublishIdempotencyKeyTTL)*time.Second), ) - eventHandler := publishmq.NewEventHandler(b.logger, deliveryMQ, entityStore, eventTracer, b.cfg.Topics, publishIdempotence) + eventHandler := publishmq.NewEventHandler(b.logger, svc.deliveryMQ, svc.entityStore, svc.eventTracer, b.cfg.Topics, publishIdempotence) router := apirouter.NewRouter( apirouter.RouterConfig{ ServiceName: b.cfg.OpenTelemetry.GetServiceName(), APIKey: b.cfg.APIKey, JWTSecret: b.cfg.APIJWTSecret, Topics: b.cfg.Topics, - Registry: registry, + Registry: svc.destRegistry, PortalConfig: b.cfg.GetPortalConfig(), GinMode: b.cfg.GinMode, }, b.logger, - redisClient, - deliveryMQ, - entityStore, - logStore, + svc.redisClient, + svc.deliveryMQ, + svc.entityStore, + svc.logStore, eventHandler, b.telemetry, ) - // Initialize retry scheduler - b.logger.Debug("creating delivery MQ retry scheduler") - deliverymqRetryScheduler, err := deliverymq.NewRetryScheduler(deliveryMQ, b.cfg.Redis.ToConfig(), b.cfg.DeploymentID, b.logger) - if err != nil { - b.logger.Error("failed to create delivery MQ retry scheduler", zap.Error(err)) - return err - } - b.logger.Debug("initializing delivery MQ retry scheduler") - if err := deliverymqRetryScheduler.Init(b.ctx); err != nil { - b.logger.Error("delivery MQ retry scheduler initialization failed", zap.Error(err)) - return err - } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - deliverymqRetryScheduler.Shutdown() - }) - // Create HTTP server b.logger.Debug("creating HTTP server") httpServer := &http.Server{ @@ -206,7 +190,7 @@ func (b *ServiceBuilder) BuildAPIWorkers() error { b.supervisor.Register(httpWorker) // Worker 2: Retry Scheduler - retryWorker := NewRetrySchedulerWorker(deliverymqRetryScheduler, b.logger) + retryWorker := NewRetrySchedulerWorker(svc.retryScheduler, b.logger) b.supervisor.Register(retryWorker) // Worker 3: PublishMQ Consumer (optional) @@ -236,112 +220,31 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { } b.services = append(b.services, svc) - // Initialize Redis - b.logger.Debug("initializing Redis client for delivery service") - redisClient, err := redis.New(b.ctx, b.cfg.Redis.ToConfig()) - if err != nil { - b.logger.Error("Redis client initialization failed in delivery service", zap.Error(err)) - return err - } - - // Initialize LogMQ - b.logger.Debug("configuring log message queue") - logQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "logmq") - if err != nil { - b.logger.Error("log queue configuration failed", zap.Error(err)) - return err - } - - b.logger.Debug("initializing log MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) - logMQ := logmq.New(logmq.WithQueue(logQueueConfig)) - cleanupLogMQ, err := logMQ.Init(b.ctx) - if err != nil { - b.logger.Error("log MQ initialization failed", zap.Error(err)) + // Initialize common infrastructure + if err := svc.initRedis(b.ctx, b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupLogMQ() }) - - // Initialize DeliveryMQ - b.logger.Debug("configuring delivery message queue") - deliveryQueueConfig, err := b.cfg.MQs.ToQueueConfig(b.ctx, "deliverymq") - if err != nil { - b.logger.Error("delivery queue configuration failed", zap.Error(err)) + if err := svc.initLogMQ(b.ctx, b.cfg, 
b.logger); err != nil { return err } - - b.logger.Debug("initializing delivery MQ connection", zap.String("mq_type", b.cfg.MQs.GetInfraType())) - deliveryMQ := deliverymq.New(deliverymq.WithQueue(deliveryQueueConfig)) - cleanupDeliveryMQ, err := deliveryMQ.Init(b.ctx) - if err != nil { - b.logger.Error("delivery MQ initialization failed", zap.Error(err)) + if err := svc.initDeliveryMQ(b.ctx, b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupDeliveryMQ() }) - - // Initialize destination registry - b.logger.Debug("initializing destination registry") - registry := destregistry.NewRegistry(&destregistry.Config{ - DestinationMetadataPath: b.cfg.Destinations.MetadataPath, - DeliveryTimeout: time.Duration(b.cfg.DeliveryTimeoutSeconds) * time.Second, - }, b.logger) - if err := destregistrydefault.RegisterDefault(registry, b.cfg.Destinations.ToConfig(b.cfg)); err != nil { - b.logger.Error("destination registry setup failed", zap.Error(err)) + if err := svc.initDestRegistry(b.cfg, b.logger); err != nil { return err } - - // Initialize event tracer - b.logger.Debug("setting up event tracer") - var eventTracer eventtracer.EventTracer - if b.cfg.OpenTelemetry.ToConfig() == nil { - eventTracer = eventtracer.NewNoopEventTracer() - } else { - eventTracer = eventtracer.NewEventTracer() - } - - // Initialize entity store - b.logger.Debug("creating entity store with Redis client") - entityStore := models.NewEntityStore(redisClient, - models.WithCipher(models.NewAESCipher(b.cfg.AESEncryptionSecret)), - models.WithAvailableTopics(b.cfg.Topics), - models.WithMaxDestinationsPerTenant(b.cfg.MaxDestinationsPerTenant), - models.WithDeploymentID(b.cfg.DeploymentID), - ) - - // Initialize log store - b.logger.Debug("configuring log store driver") - logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ - Postgres: &b.cfg.PostgresURL, - }) - if err != nil { - b.logger.Error("log store driver configuration failed", zap.Error(err)) + if err := svc.initEventTracer(b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - logStoreDriverOpts.Close() - }) - - b.logger.Debug("creating log store") - logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) - if err != nil { - b.logger.Error("log store creation failed", zap.Error(err)) + if err := svc.initEntityStore(b.cfg, b.logger); err != nil { return err } - - // Initialize retry scheduler - b.logger.Debug("creating delivery MQ retry scheduler") - retryScheduler, err := deliverymq.NewRetryScheduler(deliveryMQ, b.cfg.Redis.ToConfig(), b.cfg.DeploymentID, b.logger) - if err != nil { - b.logger.Error("failed to create delivery MQ retry scheduler", zap.Error(err)) + if err := svc.initLogStore(b.ctx, b.cfg, b.logger); err != nil { return err } - b.logger.Debug("initializing delivery MQ retry scheduler") - if err := retryScheduler.Init(b.ctx); err != nil { - b.logger.Error("delivery MQ retry scheduler initialization failed", zap.Error(err)) + if err := svc.initRetryScheduler(b.ctx, b.cfg, b.logger); err != nil { return err } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - retryScheduler.Shutdown() - }) // Initialize alert monitor var alertNotifier alert.AlertNotifier @@ -350,11 +253,11 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { alertNotifier = 
alert.NewHTTPAlertNotifier(b.cfg.Alert.CallbackURL, alert.NotifierWithBearerToken(b.cfg.APIKey)) } if b.cfg.Alert.AutoDisableDestination { - destinationDisabler = newDestinationDisabler(entityStore) + destinationDisabler = newDestinationDisabler(svc.entityStore) } alertMonitor := alert.NewAlertMonitor( b.logger, - redisClient, + svc.redisClient, alert.WithNotifier(alertNotifier), alert.WithDisabler(destinationDisabler), alert.WithAutoDisableFailureCount(b.cfg.Alert.ConsecutiveFailureCount), @@ -362,7 +265,7 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { ) // Initialize delivery idempotence - deliveryIdempotence := idempotence.New(redisClient, + deliveryIdempotence := idempotence.New(svc.redisClient, idempotence.WithTimeout(5*time.Second), idempotence.WithSuccessfulTTL(time.Duration(b.cfg.DeliveryIdempotencyKeyTTL)*time.Second), ) @@ -373,12 +276,12 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { // Create delivery handler handler := deliverymq.NewMessageHandler( b.logger, - logMQ, - entityStore, - logStore, - registry, - eventTracer, - retryScheduler, + svc.logMQ, + svc.entityStore, + svc.logStore, + svc.destRegistry, + svc.eventTracer, + svc.retryScheduler, retryBackoff, retryMaxLimit, alertMonitor, @@ -387,7 +290,7 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { // Create DeliveryMQ worker deliveryWorker := NewDeliveryMQWorker( - deliveryMQ, + svc.deliveryMQ, handler, b.cfg.DeliveryMaxConcurrency, b.logger, @@ -409,23 +312,8 @@ func (b *ServiceBuilder) BuildLogWorker() error { } b.services = append(b.services, svc) - // Initialize log store - b.logger.Debug("configuring log store driver") - logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ - Postgres: &b.cfg.PostgresURL, - }) - if err != nil { - b.logger.Error("log store driver configuration failed", zap.Error(err)) - return err - } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - logStoreDriverOpts.Close() - }) - - b.logger.Debug("creating log store") - logStore, err := logstore.NewLogStore(b.ctx, logStoreDriverOpts) - if err != nil { - b.logger.Error("log store creation failed", zap.Error(err)) + // Initialize common infrastructure + if err := svc.initLogStore(b.ctx, b.cfg, b.logger); err != nil { return err } @@ -439,7 +327,7 @@ func (b *ServiceBuilder) BuildLogWorker() error { } b.logger.Debug("creating log batcher") - batcher, err := b.makeBatcher(logStore, batcherCfg.ItemCountThreshold, batcherCfg.DelayThreshold) + batcher, err := b.makeBatcher(svc.logStore, batcherCfg.ItemCountThreshold, batcherCfg.DelayThreshold) if err != nil { b.logger.Error("failed to create batcher", zap.Error(err)) return err @@ -474,49 +362,6 @@ func (b *ServiceBuilder) BuildLogWorker() error { return nil } -// BuildWorkers builds workers based on the configured service type. 
-func (b *ServiceBuilder) BuildWorkers() error { - serviceType := b.cfg.MustGetService() - b.logger.Debug("building workers for service type", zap.String("service_type", serviceType.String())) - - if serviceType == config.ServiceTypeAPI || serviceType == config.ServiceTypeAll { - if err := b.BuildAPIWorkers(); err != nil { - b.logger.Error("failed to build API workers", zap.Error(err)) - return err - } - } - if serviceType == config.ServiceTypeDelivery || serviceType == config.ServiceTypeAll { - if err := b.BuildDeliveryWorker(); err != nil { - b.logger.Error("failed to build delivery worker", zap.Error(err)) - return err - } - } - if serviceType == config.ServiceTypeLog || serviceType == config.ServiceTypeAll { - if err := b.BuildLogWorker(); err != nil { - b.logger.Error("failed to build log worker", zap.Error(err)) - return err - } - } - - return nil -} - -// Build returns the configured WorkerSupervisor. -func (b *ServiceBuilder) Build() (*worker.WorkerSupervisor, error) { - return b.supervisor, nil -} - -// Cleanup runs all registered cleanup functions for all services. -func (b *ServiceBuilder) Cleanup(ctx context.Context) { - logger := b.logger.Ctx(ctx) - for _, svc := range b.services { - logger.Debug("cleaning up service", zap.String("service", svc.name)) - for _, cleanupFunc := range svc.cleanupFuncs { - cleanupFunc(ctx, &logger) - } - } -} - // destinationDisabler implements alert.DestinationDisabler type destinationDisabler struct { entityStore models.EntityStore @@ -602,3 +447,139 @@ func (hb *handlerBatcherImpl) Add(ctx context.Context, msg *mqs.Message) error { hb.batcher.Add("", msg) return nil } + +// Helper methods for serviceInstance to initialize common dependencies + +func (s *serviceInstance) initRedis(ctx context.Context, cfg *config.Config, logger *logging.Logger) error { + logger.Debug("initializing Redis client", zap.String("service", s.name)) + redisClient, err := redis.New(ctx, cfg.Redis.ToConfig()) + if err != nil { + logger.Error("Redis client initialization failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.redisClient = redisClient + return nil +} + +func (s *serviceInstance) initLogStore(ctx context.Context, cfg *config.Config, logger *logging.Logger) error { + logger.Debug("configuring log store driver", zap.String("service", s.name)) + logStoreDriverOpts, err := logstore.MakeDriverOpts(logstore.Config{ + Postgres: &cfg.PostgresURL, + }) + if err != nil { + logger.Error("log store driver configuration failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.cleanupFuncs = append(s.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + logStoreDriverOpts.Close() + }) + + logger.Debug("creating log store", zap.String("service", s.name)) + logStore, err := logstore.NewLogStore(ctx, logStoreDriverOpts) + if err != nil { + logger.Error("log store creation failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.logStore = logStore + return nil +} + +func (s *serviceInstance) initEntityStore(cfg *config.Config, logger *logging.Logger) error { + if s.redisClient == nil { + return fmt.Errorf("redis client must be initialized before entity store") + } + logger.Debug("creating entity store", zap.String("service", s.name)) + s.entityStore = models.NewEntityStore(s.redisClient, + models.WithCipher(models.NewAESCipher(cfg.AESEncryptionSecret)), + models.WithAvailableTopics(cfg.Topics), + models.WithMaxDestinationsPerTenant(cfg.MaxDestinationsPerTenant), + 
models.WithDeploymentID(cfg.DeploymentID), + ) + return nil +} + +func (s *serviceInstance) initDestRegistry(cfg *config.Config, logger *logging.Logger) error { + logger.Debug("initializing destination registry", zap.String("service", s.name)) + registry := destregistry.NewRegistry(&destregistry.Config{ + DestinationMetadataPath: cfg.Destinations.MetadataPath, + DeliveryTimeout: time.Duration(cfg.DeliveryTimeoutSeconds) * time.Second, + }, logger) + if err := destregistrydefault.RegisterDefault(registry, cfg.Destinations.ToConfig(cfg)); err != nil { + logger.Error("destination registry setup failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.destRegistry = registry + return nil +} + +func (s *serviceInstance) initEventTracer(cfg *config.Config, logger *logging.Logger) error { + logger.Debug("setting up event tracer", zap.String("service", s.name)) + if cfg.OpenTelemetry.ToConfig() == nil { + s.eventTracer = eventtracer.NewNoopEventTracer() + } else { + s.eventTracer = eventtracer.NewEventTracer() + } + return nil +} + +func (s *serviceInstance) initDeliveryMQ(ctx context.Context, cfg *config.Config, logger *logging.Logger) error { + logger.Debug("configuring delivery message queue", zap.String("service", s.name)) + deliveryQueueConfig, err := cfg.MQs.ToQueueConfig(ctx, "deliverymq") + if err != nil { + logger.Error("delivery queue configuration failed", zap.String("service", s.name), zap.Error(err)) + return err + } + + logger.Debug("initializing delivery MQ connection", zap.String("service", s.name), zap.String("mq_type", cfg.MQs.GetInfraType())) + deliveryMQ := deliverymq.New(deliverymq.WithQueue(deliveryQueueConfig)) + cleanupDeliveryMQ, err := deliveryMQ.Init(ctx) + if err != nil { + logger.Error("delivery MQ initialization failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.cleanupFuncs = append(s.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupDeliveryMQ() }) + s.deliveryMQ = deliveryMQ + return nil +} + +func (s *serviceInstance) initLogMQ(ctx context.Context, cfg *config.Config, logger *logging.Logger) error { + logger.Debug("configuring log message queue", zap.String("service", s.name)) + logQueueConfig, err := cfg.MQs.ToQueueConfig(ctx, "logmq") + if err != nil { + logger.Error("log queue configuration failed", zap.String("service", s.name), zap.Error(err)) + return err + } + + logger.Debug("initializing log MQ connection", zap.String("service", s.name), zap.String("mq_type", cfg.MQs.GetInfraType())) + logMQ := logmq.New(logmq.WithQueue(logQueueConfig)) + cleanupLogMQ, err := logMQ.Init(ctx) + if err != nil { + logger.Error("log MQ initialization failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.cleanupFuncs = append(s.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { cleanupLogMQ() }) + s.logMQ = logMQ + return nil +} + +func (s *serviceInstance) initRetryScheduler(ctx context.Context, cfg *config.Config, logger *logging.Logger) error { + if s.deliveryMQ == nil { + return fmt.Errorf("delivery MQ must be initialized before retry scheduler") + } + logger.Debug("creating delivery MQ retry scheduler", zap.String("service", s.name)) + retryScheduler, err := deliverymq.NewRetryScheduler(s.deliveryMQ, cfg.Redis.ToConfig(), cfg.DeploymentID, logger) + if err != nil { + logger.Error("failed to create delivery MQ retry scheduler", zap.String("service", s.name), zap.Error(err)) + return err + } + logger.Debug("initializing delivery MQ retry scheduler", 
zap.String("service", s.name)) + if err := retryScheduler.Init(ctx); err != nil { + logger.Error("delivery MQ retry scheduler initialization failed", zap.String("service", s.name), zap.Error(err)) + return err + } + s.cleanupFuncs = append(s.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + retryScheduler.Shutdown() + }) + s.retryScheduler = retryScheduler + return nil +} From e07c71d196bac20a05067807afc77ee8aa12340d Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 01:05:08 +0700 Subject: [PATCH 06/19] chore: gofmt --- internal/apirouter/logger_middleware_integration_test.go | 2 +- internal/app/app.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/apirouter/logger_middleware_integration_test.go b/internal/apirouter/logger_middleware_integration_test.go index dd92a71a..2d44ea86 100644 --- a/internal/apirouter/logger_middleware_integration_test.go +++ b/internal/apirouter/logger_middleware_integration_test.go @@ -11,11 +11,11 @@ import ( "testing" "github.com/gin-gonic/gin" + "github.com/hookdeck/outpost/internal/apirouter" "github.com/hookdeck/outpost/internal/destregistry" "github.com/hookdeck/outpost/internal/destregistry/metadata" "github.com/hookdeck/outpost/internal/logging" "github.com/hookdeck/outpost/internal/models" - "github.com/hookdeck/outpost/internal/apirouter" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/uptrace/opentelemetry-go-extra/otelzap" diff --git a/internal/app/app.go b/internal/app/app.go index d2f11e81..e33ad9d4 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -143,7 +143,7 @@ func run(mainContext context.Context, cfg *config.Config) error { select { case <-termChan: logger.Info("shutdown signal received") - cancel() // Cancel context to trigger graceful shutdown + cancel() // Cancel context to trigger graceful shutdown <-errChan // Wait for workers to finish case err := <-errChan: if err != nil { From b6527d2500c5ab804dae591000bc82720437d87e Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 02:33:42 +0700 Subject: [PATCH 07/19] fix: exit with proper error code --- cmd/e2e/suites_test.go | 5 +++++ internal/app/app.go | 16 ++++++++++++---- internal/worker/supervisor.go | 6 +++--- internal/worker/worker_test.go | 18 ++++++++++-------- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/cmd/e2e/suites_test.go b/cmd/e2e/suites_test.go index d76471fe..9bb1f0bb 100644 --- a/cmd/e2e/suites_test.go +++ b/cmd/e2e/suites_test.go @@ -30,14 +30,17 @@ type e2eSuite struct { mockServerInfra *testinfra.MockServerInfra cleanup func() client httpclient.Client + appDone chan struct{} } func (suite *e2eSuite) SetupSuite() { ctx, cancel := context.WithCancel(context.Background()) suite.ctx = ctx suite.cancel = cancel + suite.appDone = make(chan struct{}) suite.client = httpclient.New(fmt.Sprintf("http://localhost:%d/api/v1", suite.config.APIPort), suite.config.APIKey) go func() { + defer close(suite.appDone) application := app.New(&suite.config) if err := application.Run(suite.ctx); err != nil { log.Println("Application failed to run", err) @@ -48,6 +51,8 @@ func (suite *e2eSuite) SetupSuite() { func (s *e2eSuite) TearDownSuite() { if s.cancel != nil { s.cancel() + // Wait for application to fully shut down before cleaning up resources + <-s.appDone } s.cleanup() } diff --git a/internal/app/app.go b/internal/app/app.go index e33ad9d4..9e29ec08 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -140,14 +140,22 @@ func 
run(mainContext context.Context, cfg *config.Config) error { }() // Wait for either termination signal or worker failure + var exitErr error select { case <-termChan: logger.Info("shutdown signal received") - cancel() // Cancel context to trigger graceful shutdown - <-errChan // Wait for workers to finish + cancel() // Cancel context to trigger graceful shutdown + err := <-errChan + // context.Canceled is expected during graceful shutdown + if err != nil && !errors.Is(err, context.Canceled) { + logger.Error("error during graceful shutdown", zap.Error(err)) + exitErr = err + } case err := <-errChan: + // Workers exited unexpectedly if err != nil { - logger.Error("worker error", zap.Error(err)) + logger.Error("workers exited unexpectedly", zap.Error(err)) + exitErr = err } } @@ -160,7 +168,7 @@ func run(mainContext context.Context, cfg *config.Config) error { logger.Info("outpost shutdown complete") - return nil + return exitErr } // runMigration handles database schema migrations with retry logic for lock conflicts. diff --git a/internal/worker/supervisor.go b/internal/worker/supervisor.go index 3ec0b489..3952ecd6 100644 --- a/internal/worker/supervisor.go +++ b/internal/worker/supervisor.go @@ -86,7 +86,7 @@ func (r *WorkerSupervisor) GetHealthTracker() *HealthTracker { func (r *WorkerSupervisor) Run(ctx context.Context) error { if len(r.workers) == 0 { r.logger.Warn("no workers registered") - return nil + return fmt.Errorf("no workers registered") } r.logger.Info("starting workers", zap.Int("count", len(r.workers))) @@ -129,11 +129,11 @@ func (r *WorkerSupervisor) Run(ctx context.Context) error { // No timeout - wait indefinitely wg.Wait() - return nil + return ctx.Err() case <-r.waitForWorkers(&wg): // All workers exited (either successfully or with errors) r.logger.Warn("all workers have exited") - return nil + return fmt.Errorf("all workers have exited unexpectedly") } } diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go index a36f58ce..6f4f9320 100644 --- a/internal/worker/worker_test.go +++ b/internal/worker/worker_test.go @@ -238,7 +238,7 @@ func TestWorkerSupervisor_Run_HealthyWorkers(t *testing.T) { cancel() err := <-errChan - assert.NoError(t, err) + assert.ErrorIs(t, err, context.Canceled) } func TestWorkerSupervisor_Run_FailedWorker(t *testing.T) { @@ -289,7 +289,7 @@ func TestWorkerSupervisor_Run_FailedWorker(t *testing.T) { // Now cancel context and verify graceful shutdown cancel() err := <-errChan - assert.NoError(t, err) // Graceful shutdown should return nil + assert.ErrorIs(t, err, context.Canceled) // Graceful shutdown should return context.Canceled from ctx.Err() } func TestWorkerSupervisor_Run_AllWorkersExit(t *testing.T) { @@ -319,7 +319,8 @@ func TestWorkerSupervisor_Run_AllWorkersExit(t *testing.T) { // Wait for both workers to exit err := <-errChan - assert.NoError(t, err) // Should return nil when all workers exit + assert.Error(t, err) // Should return error when all workers exit unexpectedly + assert.Contains(t, err.Error(), "all workers have exited unexpectedly") // Verify both workers are marked as failed assert.False(t, supervisor.GetHealthTracker().IsHealthy()) @@ -360,7 +361,7 @@ func TestWorkerSupervisor_Run_ContextCanceled(t *testing.T) { cancel() err := <-errChan - assert.NoError(t, err) // context.Canceled is not treated as error + assert.ErrorIs(t, err, context.Canceled) // Should return context.Canceled } func TestWorkerSupervisor_Run_NoWorkers(t *testing.T) { @@ -370,7 +371,8 @@ func TestWorkerSupervisor_Run_NoWorkers(t 
*testing.T) { ctx := context.Background() err := supervisor.Run(ctx) - assert.NoError(t, err) + assert.Error(t, err) + assert.Contains(t, err.Error(), "no workers registered") assert.True(t, logger.Contains("no workers registered")) } @@ -460,7 +462,7 @@ func TestWorkerSupervisor_Run_VariableShutdownTiming(t *testing.T) { err := <-errChan elapsed := time.Since(start) - assert.NoError(t, err) + assert.ErrorIs(t, err, context.Canceled) // Supervisor should wait for the slowest worker (200ms) // Total time should be at least 200ms (slow worker) + some overhead @@ -504,7 +506,7 @@ func TestWorkerSupervisor_Run_VerySlowShutdown_NoTimeout(t *testing.T) { select { case err := <-errChan: elapsed := time.Since(start) - assert.NoError(t, err) + assert.ErrorIs(t, err, context.Canceled) // Should wait the full 2 seconds for worker to finish assert.True(t, elapsed >= 2*time.Second, @@ -594,7 +596,7 @@ func TestWorkerSupervisor_Run_ShutdownTimeout_FastWorkers(t *testing.T) { err := <-errChan elapsed := time.Since(start) - // Should NOT timeout since worker finishes quickly + // Should NOT timeout since worker finishes quickly (returns nil when timeout configured but not exceeded) assert.NoError(t, err) // Should return after ~100ms (worker shutdown time), not 2s (timeout) From 20fcb7aaff5a09cefe3d49199828e0c5a493d48a Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 02:47:02 +0700 Subject: [PATCH 08/19] feat: extend healthz endpoint to return full worker status --- cmd/e2e/api_test.go | 11 ++ internal/apirouter/auth_middleware_test.go | 6 +- internal/apirouter/logger_middleware.go | 6 +- internal/apirouter/router.go | 4 - internal/apirouter/router_test.go | 20 ---- internal/services/builder.go | 115 +++++++++++++++------ internal/services/health.go | 39 +++++++ 7 files changed, 138 insertions(+), 63 deletions(-) create mode 100644 internal/services/health.go diff --git a/cmd/e2e/api_test.go b/cmd/e2e/api_test.go index c0adc093..380444f9 100644 --- a/cmd/e2e/api_test.go +++ b/cmd/e2e/api_test.go @@ -23,6 +23,17 @@ func (suite *basicSuite) TestHealthzAPI() { Match: &httpclient.Response{ StatusCode: http.StatusOK, }, + Validate: map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "status": map[string]interface{}{ + "type": "string", + }, + "workers": map[string]interface{}{ + "type": "object", + }, + }, + }, }, }, } diff --git a/internal/apirouter/auth_middleware_test.go b/internal/apirouter/auth_middleware_test.go index 60db3434..18fa43db 100644 --- a/internal/apirouter/auth_middleware_test.go +++ b/internal/apirouter/auth_middleware_test.go @@ -20,7 +20,7 @@ func TestPublicRouter(t *testing.T) { t.Run("should accept requests without a token", func(t *testing.T) { t.Parallel() w := httptest.NewRecorder() - req, _ := http.NewRequest("GET", baseAPIPath+"/healthz", nil) + req, _ := http.NewRequest("GET", baseAPIPath+"/tenant-id/topics", nil) router.ServeHTTP(w, req) assert.Equal(t, http.StatusOK, w.Code) }) @@ -28,7 +28,7 @@ func TestPublicRouter(t *testing.T) { t.Run("should accept requests with an invalid authorization token", func(t *testing.T) { t.Parallel() w := httptest.NewRecorder() - req, _ := http.NewRequest("GET", baseAPIPath+"/healthz", nil) + req, _ := http.NewRequest("GET", baseAPIPath+"/tenant-id/topics", nil) req.Header.Set("Authorization", "invalid key") router.ServeHTTP(w, req) assert.Equal(t, http.StatusOK, w.Code) @@ -37,7 +37,7 @@ func TestPublicRouter(t *testing.T) { t.Run("should accept requests with a valid authorization token", func(t 
*testing.T) { t.Parallel() w := httptest.NewRecorder() - req, _ := http.NewRequest("GET", baseAPIPath+"/healthz", nil) + req, _ := http.NewRequest("GET", baseAPIPath+"/tenant-id/topics", nil) req.Header.Set("Authorization", "Bearer key") router.ServeHTTP(w, req) assert.Equal(t, http.StatusOK, w.Code) diff --git a/internal/apirouter/logger_middleware.go b/internal/apirouter/logger_middleware.go index 0e0cdcf7..be3f72aa 100644 --- a/internal/apirouter/logger_middleware.go +++ b/internal/apirouter/logger_middleware.go @@ -60,11 +60,7 @@ func LoggerMiddlewareWithSanitizer(logger *logging.Logger, sanitizer *RequestBod hub.CaptureException(getErrorWithStackTrace(c.Errors.Last().Err)) } } else { - if strings.HasPrefix(c.Request.URL.Path, "/api") && strings.HasSuffix(c.Request.URL.Path, "/healthz") { - logger.Debug("healthz request completed", fields...) - } else { - logger.Info("request completed", fields...) - } + logger.Info("request completed", fields...) } } } diff --git a/internal/apirouter/router.go b/internal/apirouter/router.go index a0f3e009..3f96092e 100644 --- a/internal/apirouter/router.go +++ b/internal/apirouter/router.go @@ -148,10 +148,6 @@ func NewRouter( apiRouter := r.Group("/api/v1") apiRouter.Use(SetTenantIDMiddleware()) - apiRouter.GET("/healthz", func(c *gin.Context) { - c.String(http.StatusOK, "OK") - }) - tenantHandlers := NewTenantHandlers(logger, telemetry, cfg.JWTSecret, entityStore) destinationHandlers := NewDestinationHandlers(logger, telemetry, entityStore, cfg.Topics, cfg.Registry) publishHandlers := NewPublishHandlers(logger, publishmqEventHandler) diff --git a/internal/apirouter/router_test.go b/internal/apirouter/router_test.go index 8c14396b..fd2511eb 100644 --- a/internal/apirouter/router_test.go +++ b/internal/apirouter/router_test.go @@ -94,16 +94,6 @@ func TestRouterWithAPIKey(t *testing.T) { t.Fatal(err) } - t.Run("healthcheck should work", func(t *testing.T) { - t.Parallel() - - w := httptest.NewRecorder() - req, _ := http.NewRequest("GET", baseAPIPath+"/healthz", nil) - router.ServeHTTP(w, req) - - assert.Equal(t, http.StatusOK, w.Code) - }) - t.Run("should block unauthenticated request to admin routes", func(t *testing.T) { t.Parallel() @@ -207,16 +197,6 @@ func TestRouterWithoutAPIKey(t *testing.T) { t.Fatal(err) } - t.Run("healthcheck should work", func(t *testing.T) { - t.Parallel() - - w := httptest.NewRecorder() - req, _ := http.NewRequest("GET", baseAPIPath+"/healthz", nil) - router.ServeHTTP(w, req) - - assert.Equal(t, http.StatusOK, w.Code) - }) - t.Run("should allow unauthenticated request to admin routes", func(t *testing.T) { t.Parallel() diff --git a/internal/services/builder.go b/internal/services/builder.go index 31590ef4..a5a28a97 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -6,6 +6,7 @@ import ( "net/http" "time" + "github.com/gin-gonic/gin" "github.com/hookdeck/outpost/internal/alert" apirouter "github.com/hookdeck/outpost/internal/apirouter" "github.com/hookdeck/outpost/internal/config" @@ -54,6 +55,9 @@ type serviceInstance struct { deliveryMQ *deliverymq.DeliveryMQ logMQ *logmq.LogMQ retryScheduler scheduler.Scheduler + + // HTTP server and router + router http.Handler } // NewServiceBuilder creates a new ServiceBuilder. 
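(Reviewer aside, not part of the patch: with this commit `/healthz` stops returning plain `OK` and instead reports every supervised worker, answering 200 only while all workers are healthy and 503 once any of them has failed. A minimal Go sketch of how a readiness probe might consume the new shape; the endpoint path and field names follow this series, while the host, port, and timeout are illustrative assumptions.)

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// Mirrors the response shape introduced here: an overall status plus a
// per-worker status map (error details are intentionally not exposed).
type healthzResponse struct {
	Status  string `json:"status"`
	Workers map[string]struct {
		Status string `json:"status"`
	} `json:"workers"`
}

func main() {
	client := &http.Client{Timeout: 2 * time.Second}

	// localhost:3333 is an illustrative assumption; use the configured API port.
	resp, err := client.Get("http://localhost:3333/healthz")
	if err != nil {
		fmt.Println("healthz unreachable:", err)
		return
	}
	defer resp.Body.Close()

	var body healthzResponse
	if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
		fmt.Println("unexpected body:", err)
		return
	}

	// 200 means every worker is healthy; 503 means at least one has failed.
	fmt.Printf("http %d, overall %q\n", resp.StatusCode, body.Status)
	for name, w := range body.Workers {
		fmt.Printf("  %s: %s\n", name, w.Status)
	}
}
```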
@@ -73,28 +77,81 @@ func (b *ServiceBuilder) BuildWorkers() (*worker.WorkerSupervisor, error) { serviceType := b.cfg.MustGetService() b.logger.Debug("building workers for service type", zap.String("service_type", serviceType.String())) + // Create base router with health check that all services will extend + b.logger.Debug("creating base router with health check") + baseRouter := NewBaseRouter(b.supervisor, b.cfg.GinMode) + if serviceType == config.ServiceTypeAPI || serviceType == config.ServiceTypeAll { - if err := b.BuildAPIWorkers(); err != nil { + if err := b.BuildAPIWorkers(baseRouter); err != nil { b.logger.Error("failed to build API workers", zap.Error(err)) return nil, err } } if serviceType == config.ServiceTypeDelivery || serviceType == config.ServiceTypeAll { - if err := b.BuildDeliveryWorker(); err != nil { + if err := b.BuildDeliveryWorker(baseRouter); err != nil { b.logger.Error("failed to build delivery worker", zap.Error(err)) return nil, err } } if serviceType == config.ServiceTypeLog || serviceType == config.ServiceTypeAll { - if err := b.BuildLogWorker(); err != nil { + if err := b.BuildLogWorker(baseRouter); err != nil { b.logger.Error("failed to build log worker", zap.Error(err)) return nil, err } } + // Create HTTP server with the base router + if err := b.createHTTPServer(baseRouter); err != nil { + b.logger.Error("failed to create HTTP server", zap.Error(err)) + return nil, err + } + return b.supervisor, nil } +// createHTTPServer creates and registers the HTTP server worker with the given router +func (b *ServiceBuilder) createHTTPServer(router http.Handler) error { + // Create HTTP server + b.logger.Debug("creating HTTP server") + httpServer := &http.Server{ + Addr: fmt.Sprintf(":%d", b.cfg.APIPort), + Handler: router, + } + + // Add cleanup for HTTP server to the first service (or API service for service=all) + serviceType := b.cfg.MustGetService() + var targetSvc *serviceInstance + if serviceType == config.ServiceTypeAll { + // For service=all, prefer API service for cleanup + for _, svc := range b.services { + if svc.name == "api" { + targetSvc = svc + break + } + } + } + if targetSvc == nil && len(b.services) > 0 { + // Fallback to first service + targetSvc = b.services[0] + } + + if targetSvc != nil { + targetSvc.cleanupFuncs = append(targetSvc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { + if err := httpServer.Shutdown(ctx); err != nil { + logger.Error("error shutting down http server", zap.Error(err)) + } + logger.Info("http server shut down") + }) + } + + // Register HTTP server worker + httpWorker := NewHTTPServerWorker(httpServer, b.logger) + b.supervisor.Register(httpWorker) + + b.logger.Info("HTTP server created", zap.String("addr", httpServer.Addr)) + return nil +} + // Cleanup runs all registered cleanup functions for all services. func (b *ServiceBuilder) Cleanup(ctx context.Context) { logger := b.logger.Ctx(ctx) @@ -106,12 +163,12 @@ func (b *ServiceBuilder) Cleanup(ctx context.Context) { } } -// BuildAPIWorkers creates and registers all workers for the API service. -// This sets up the infrastructure and creates 3 workers: -// 1. HTTP server -// 2. Retry scheduler -// 3. PublishMQ consumer (optional) -func (b *ServiceBuilder) BuildAPIWorkers() error { +// BuildAPIWorkers creates the API router and registers workers for the API service. +// This sets up the infrastructure, creates the API router, and registers workers: +// 1. Retry scheduler +// 2. 
PublishMQ consumer (optional) +// The baseRouter parameter is extended with API routes (apirouter already has health check) +func (b *ServiceBuilder) BuildAPIWorkers(baseRouter *gin.Engine) error { b.logger.Debug("building API service workers") // Create a new service instance for API @@ -146,14 +203,16 @@ func (b *ServiceBuilder) BuildAPIWorkers() error { return err } - // Initialize event handler and router - b.logger.Debug("creating event handler and router") + // Initialize event handler and create API router + b.logger.Debug("creating event handler and API router") publishIdempotence := idempotence.New(svc.redisClient, idempotence.WithTimeout(5*time.Second), idempotence.WithSuccessfulTTL(time.Duration(b.cfg.PublishIdempotencyKeyTTL)*time.Second), ) eventHandler := publishmq.NewEventHandler(b.logger, svc.deliveryMQ, svc.entityStore, svc.eventTracer, b.cfg.Topics, publishIdempotence) - router := apirouter.NewRouter( + + // Create API router as separate handler + apiHandler := apirouter.NewRouter( apirouter.RouterConfig{ ServiceName: b.cfg.OpenTelemetry.GetServiceName(), APIKey: b.cfg.APIKey, @@ -172,28 +231,16 @@ func (b *ServiceBuilder) BuildAPIWorkers() error { b.telemetry, ) - // Create HTTP server - b.logger.Debug("creating HTTP server") - httpServer := &http.Server{ - Addr: fmt.Sprintf(":%d", b.cfg.APIPort), - Handler: router, - } - svc.cleanupFuncs = append(svc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - if err := httpServer.Shutdown(ctx); err != nil { - logger.Error("error shutting down http server", zap.Error(err)) - } - logger.Info("http server shut down") - }) + // Mount API handler onto base router (everything except /healthz goes to apiHandler) + baseRouter.NoRoute(gin.WrapH(apiHandler)) - // Worker 1: HTTP Server - httpWorker := NewHTTPServerWorker(httpServer, b.logger) - b.supervisor.Register(httpWorker) + svc.router = baseRouter - // Worker 2: Retry Scheduler + // Worker 1: Retry Scheduler retryWorker := NewRetrySchedulerWorker(svc.retryScheduler, b.logger) b.supervisor.Register(retryWorker) - // Worker 3: PublishMQ Consumer (optional) + // Worker 2: PublishMQ Consumer (optional) if b.cfg.PublishMQ.GetQueueConfig() != nil { publishMQ := publishmq.New(publishmq.WithQueue(b.cfg.PublishMQ.GetQueueConfig())) publishMQWorker := NewPublishMQWorker( @@ -210,7 +257,7 @@ func (b *ServiceBuilder) BuildAPIWorkers() error { } // BuildDeliveryWorker creates and registers the delivery worker. -func (b *ServiceBuilder) BuildDeliveryWorker() error { +func (b *ServiceBuilder) BuildDeliveryWorker(baseRouter *gin.Engine) error { b.logger.Debug("building delivery service worker") // Create a new service instance for Delivery @@ -288,6 +335,9 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { deliveryIdempotence, ) + // Store reference to the base router + svc.router = baseRouter + // Create DeliveryMQ worker deliveryWorker := NewDeliveryMQWorker( svc.deliveryMQ, @@ -302,7 +352,7 @@ func (b *ServiceBuilder) BuildDeliveryWorker() error { } // BuildLogWorker creates and registers the log worker. 
-func (b *ServiceBuilder) BuildLogWorker() error { +func (b *ServiceBuilder) BuildLogWorker(baseRouter *gin.Engine) error { b.logger.Debug("building log service worker") // Create a new service instance for Log @@ -349,6 +399,9 @@ func (b *ServiceBuilder) BuildLogWorker() error { logMQ := logmq.New(logmq.WithQueue(logQueueConfig)) + // Store reference to the base router + svc.router = baseRouter + // Create LogMQ worker logWorker := NewLogMQWorker( logMQ, diff --git a/internal/services/health.go b/internal/services/health.go new file mode 100644 index 00000000..84daf751 --- /dev/null +++ b/internal/services/health.go @@ -0,0 +1,39 @@ +package services + +import ( + "net/http" + + "github.com/gin-gonic/gin" + "github.com/hookdeck/outpost/internal/worker" +) + +// HealthHandler creates a health check handler that reports worker supervisor health +func HealthHandler(supervisor *worker.WorkerSupervisor) gin.HandlerFunc { + return func(c *gin.Context) { + tracker := supervisor.GetHealthTracker() + status := tracker.GetStatus() + if tracker.IsHealthy() { + c.JSON(http.StatusOK, status) + } else { + c.JSON(http.StatusServiceUnavailable, status) + } + } +} + +// NewBaseRouter creates a base router with health check endpoint +// This is used by all services to expose /healthz +// +// TODO: Rethink API versioning strategy in the future. +// For now, we expose health check at both /healthz and /api/v1/healthz for backwards compatibility. +// The /api/v1 prefix is hardcoded here but should be part of a broader versioning approach. +func NewBaseRouter(supervisor *worker.WorkerSupervisor, ginMode string) *gin.Engine { + gin.SetMode(ginMode) + r := gin.New() + r.Use(gin.Recovery()) + + healthHandler := HealthHandler(supervisor) + r.GET("/healthz", healthHandler) + r.GET("/api/v1/healthz", healthHandler) + + return r +} From a5896b249c6324f1b68814ae1b6bd85f89003336 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 02:56:26 +0700 Subject: [PATCH 09/19] chore: openapi schema --- docs/apis/openapi.yaml | 94 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/docs/apis/openapi.yaml b/docs/apis/openapi.yaml index bec4e773..a0989479 100644 --- a/docs/apis/openapi.yaml +++ b/docs/apis/openapi.yaml @@ -1686,17 +1686,101 @@ paths: get: tags: [Health] summary: Health Check - description: Simple health check endpoint. + description: | + Health check endpoint that reports the status of all workers. + + Returns HTTP 200 when all workers are healthy, or HTTP 503 if any worker has failed. + + The response includes: + - `status`: Overall health status ("healthy" or "failed") + - `workers`: Map of worker names to their individual health status + + Each worker reports: + - `status`: Worker health ("healthy" or "failed") + - `last_check`: ISO 8601 timestamp of last health check + + Note: Error details are not exposed for security reasons. Check application logs for detailed error information. operationId: healthCheck security: [] responses: "200": - description: Service is healthy. + description: Service is healthy - all workers are operational. 
content: - text/plain: + application/json: + schema: + type: object + required: + - status + - workers + properties: + status: + type: string + enum: [healthy] + example: healthy + workers: + type: object + additionalProperties: + type: object + required: + - status + - last_check + properties: + status: + type: string + enum: [healthy] + example: healthy + last_check: + type: string + format: date-time + example: "2025-11-11T10:30:00Z" + example: + status: healthy + workers: + http-server: + status: healthy + last_check: "2025-11-11T10:30:00Z" + retry-scheduler: + status: healthy + last_check: "2025-11-11T10:30:00Z" + "503": + description: Service is unhealthy - one or more workers have failed. + content: + application/json: schema: - type: string - example: OK + type: object + required: + - status + - workers + properties: + status: + type: string + enum: [failed] + example: failed + workers: + type: object + additionalProperties: + type: object + required: + - status + - last_check + properties: + status: + type: string + enum: [healthy, failed] + example: failed + last_check: + type: string + format: date-time + example: "2025-11-11T10:30:10Z" + example: + status: failed + workers: + http-server: + status: healthy + last_check: "2025-11-11T10:30:15Z" + retry-scheduler: + status: failed + last_check: "2025-11-11T10:30:10Z" # Tenants /{tenant_id}: parameters: From 348339f86b73b8bfdf6a508666cfda1eb7c8ca66 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 03:00:02 +0700 Subject: [PATCH 10/19] chore: remove unnecessary field --- internal/worker/health.go | 3 --- internal/worker/worker_test.go | 1 - 2 files changed, 4 deletions(-) diff --git a/internal/worker/health.go b/internal/worker/health.go index 9ce71f53..cfbbaed0 100644 --- a/internal/worker/health.go +++ b/internal/worker/health.go @@ -13,7 +13,6 @@ const ( // WorkerHealth represents the health status of a single worker. // Error details are NOT exposed for security reasons. 
type WorkerHealth struct { - Name string `json:"name"` Status string `json:"status"` // "healthy" or "failed" LastCheck time.Time `json:"last_check"` } @@ -38,7 +37,6 @@ func (h *HealthTracker) MarkHealthy(name string) { defer h.mu.Unlock() h.workers[name] = WorkerHealth{ - Name: name, Status: WorkerStatusHealthy, LastCheck: time.Now(), } @@ -51,7 +49,6 @@ func (h *HealthTracker) MarkFailed(name string) { defer h.mu.Unlock() h.workers[name] = WorkerHealth{ - Name: name, Status: WorkerStatusFailed, LastCheck: time.Now(), } diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go index 6f4f9320..b295b756 100644 --- a/internal/worker/worker_test.go +++ b/internal/worker/worker_test.go @@ -169,7 +169,6 @@ func TestHealthTracker_NoErrorExposed(t *testing.T) { // Verify WorkerHealth struct has no Error field (compile-time check via struct) // If Error field existed, this would have compile error _ = WorkerHealth{ - Name: "test", Status: "healthy", LastCheck: time.Now(), } From 932929fac7502325cd7b0bf7b7439238d46a17bf Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 03:04:01 +0700 Subject: [PATCH 11/19] fix: worker healthcheck logic --- internal/worker/supervisor.go | 2 +- internal/worker/worker_test.go | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/internal/worker/supervisor.go b/internal/worker/supervisor.go index 3952ecd6..81445f05 100644 --- a/internal/worker/supervisor.go +++ b/internal/worker/supervisor.go @@ -101,6 +101,7 @@ func (r *WorkerSupervisor) Run(ctx context.Context) error { defer wg.Done() r.logger.Info("worker starting", zap.String("worker", name)) + r.health.MarkHealthy(name) // Run the worker if err := w.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { @@ -110,7 +111,6 @@ func (r *WorkerSupervisor) Run(ctx context.Context) error { r.health.MarkFailed(name) } else { r.logger.Info("worker stopped gracefully", zap.String("worker", name)) - r.health.MarkHealthy(name) } }(name, worker) } diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go index b295b756..603b084c 100644 --- a/internal/worker/worker_test.go +++ b/internal/worker/worker_test.go @@ -230,8 +230,19 @@ func TestWorkerSupervisor_Run_HealthyWorkers(t *testing.T) { assert.True(t, worker1.WasStarted()) assert.True(t, worker2.WasStarted()) - // Verify health - assert.True(t, supervisor.GetHealthTracker().IsHealthy()) + // Verify health while workers are running + tracker := supervisor.GetHealthTracker() + assert.True(t, tracker.IsHealthy(), "all workers should be healthy while running") + + status := tracker.GetStatus() + assert.Equal(t, "healthy", status["status"]) + + workers := status["workers"].(map[string]WorkerHealth) + assert.Len(t, workers, 2, "should have 2 workers in health status") + assert.Equal(t, WorkerStatusHealthy, workers["worker-1"].Status, "worker-1 should be healthy") + assert.Equal(t, WorkerStatusHealthy, workers["worker-2"].Status, "worker-2 should be healthy") + assert.NotZero(t, workers["worker-1"].LastCheck, "worker-1 should have last_check timestamp") + assert.NotZero(t, workers["worker-2"].LastCheck, "worker-2 should have last_check timestamp") // Cancel context and verify graceful shutdown cancel() From 02016df5f329343d07443cb5c2e4421d21010c54 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 03:12:56 +0700 Subject: [PATCH 12/19] chore: change healthz response schema --- cmd/e2e/api_test.go | 3 +++ docs/apis/openapi.yaml | 30 +++++++++++++++--------------- 
internal/worker/health.go | 14 ++++++-------- internal/worker/worker_test.go | 6 ++---- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/cmd/e2e/api_test.go b/cmd/e2e/api_test.go index 380444f9..b8b6114d 100644 --- a/cmd/e2e/api_test.go +++ b/cmd/e2e/api_test.go @@ -29,6 +29,9 @@ func (suite *basicSuite) TestHealthzAPI() { "status": map[string]interface{}{ "type": "string", }, + "timestamp": map[string]interface{}{ + "type": "string", + }, "workers": map[string]interface{}{ "type": "object", }, diff --git a/docs/apis/openapi.yaml b/docs/apis/openapi.yaml index a0989479..6ae97d20 100644 --- a/docs/apis/openapi.yaml +++ b/docs/apis/openapi.yaml @@ -1693,11 +1693,11 @@ paths: The response includes: - `status`: Overall health status ("healthy" or "failed") + - `timestamp`: When this health check was performed (ISO 8601 format) - `workers`: Map of worker names to their individual health status Each worker reports: - `status`: Worker health ("healthy" or "failed") - - `last_check`: ISO 8601 timestamp of last health check Note: Error details are not exposed for security reasons. Check application logs for detailed error information. operationId: healthCheck @@ -1711,37 +1711,37 @@ paths: type: object required: - status + - timestamp - workers properties: status: type: string enum: [healthy] example: healthy + timestamp: + type: string + format: date-time + description: When this health check was performed + example: "2025-11-11T10:30:00Z" workers: type: object additionalProperties: type: object required: - status - - last_check properties: status: type: string enum: [healthy] example: healthy - last_check: - type: string - format: date-time - example: "2025-11-11T10:30:00Z" example: status: healthy + timestamp: "2025-11-11T10:30:00Z" workers: http-server: status: healthy - last_check: "2025-11-11T10:30:00Z" retry-scheduler: status: healthy - last_check: "2025-11-11T10:30:00Z" "503": description: Service is unhealthy - one or more workers have failed. content: @@ -1750,37 +1750,37 @@ paths: type: object required: - status + - timestamp - workers properties: status: type: string enum: [failed] example: failed + timestamp: + type: string + format: date-time + description: When this health check was performed + example: "2025-11-11T10:30:15Z" workers: type: object additionalProperties: type: object required: - status - - last_check properties: status: type: string enum: [healthy, failed] example: failed - last_check: - type: string - format: date-time - example: "2025-11-11T10:30:10Z" example: status: failed + timestamp: "2025-11-11T10:30:15Z" workers: http-server: status: healthy - last_check: "2025-11-11T10:30:15Z" retry-scheduler: status: failed - last_check: "2025-11-11T10:30:10Z" # Tenants /{tenant_id}: parameters: diff --git a/internal/worker/health.go b/internal/worker/health.go index cfbbaed0..24964557 100644 --- a/internal/worker/health.go +++ b/internal/worker/health.go @@ -13,8 +13,7 @@ const ( // WorkerHealth represents the health status of a single worker. // Error details are NOT exposed for security reasons. type WorkerHealth struct { - Status string `json:"status"` // "healthy" or "failed" - LastCheck time.Time `json:"last_check"` + Status string `json:"status"` // "healthy" or "failed" } // HealthTracker tracks the health status of all workers. 
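(Reviewer aside, not part of the patch: the health.go hunks in this commit change what `GetStatus` reports — a single top-level `timestamp` replaces the per-worker `last_check`, and each worker entry keeps only its status. A minimal sketch of the tracker semantics after this commit; the worker names are illustrative.)

```go
package main

import (
	"fmt"

	"github.com/hookdeck/outpost/internal/worker"
)

func main() {
	tracker := worker.NewHealthTracker()

	// The supervisor marks a worker healthy when it starts running...
	tracker.MarkHealthy("http-server")
	tracker.MarkHealthy("retry-scheduler")

	// ...and failed only when it exits with a non-graceful error.
	tracker.MarkFailed("retry-scheduler")

	fmt.Println(tracker.IsHealthy()) // false: one worker has failed

	// After this commit the payload carries "status", "timestamp", and "workers";
	// per-worker entries expose only their status.
	status := tracker.GetStatus()
	fmt.Println(status["status"]) // "failed"
}
```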
@@ -37,8 +36,7 @@ func (h *HealthTracker) MarkHealthy(name string) { defer h.mu.Unlock() h.workers[name] = WorkerHealth{ - Status: WorkerStatusHealthy, - LastCheck: time.Now(), + Status: WorkerStatusHealthy, } } @@ -49,8 +47,7 @@ func (h *HealthTracker) MarkFailed(name string) { defer h.mu.Unlock() h.workers[name] = WorkerHealth{ - Status: WorkerStatusFailed, - LastCheck: time.Now(), + Status: WorkerStatusFailed, } } @@ -83,8 +80,9 @@ func (h *HealthTracker) GetStatus() map[string]interface{} { } return map[string]interface{}{ - "status": status, - "workers": workers, + "status": status, + "timestamp": time.Now(), + "workers": workers, } } diff --git a/internal/worker/worker_test.go b/internal/worker/worker_test.go index 603b084c..eab0dc71 100644 --- a/internal/worker/worker_test.go +++ b/internal/worker/worker_test.go @@ -169,8 +169,7 @@ func TestHealthTracker_NoErrorExposed(t *testing.T) { // Verify WorkerHealth struct has no Error field (compile-time check via struct) // If Error field existed, this would have compile error _ = WorkerHealth{ - Status: "healthy", - LastCheck: time.Now(), + Status: "healthy", } } @@ -241,8 +240,7 @@ func TestWorkerSupervisor_Run_HealthyWorkers(t *testing.T) { assert.Len(t, workers, 2, "should have 2 workers in health status") assert.Equal(t, WorkerStatusHealthy, workers["worker-1"].Status, "worker-1 should be healthy") assert.Equal(t, WorkerStatusHealthy, workers["worker-2"].Status, "worker-2 should be healthy") - assert.NotZero(t, workers["worker-1"].LastCheck, "worker-1 should have last_check timestamp") - assert.NotZero(t, workers["worker-2"].LastCheck, "worker-2 should have last_check timestamp") + assert.NotZero(t, status["timestamp"], "should have timestamp field") // Cancel context and verify graceful shutdown cancel() From 228a121793bf680917c3bc1946cc33db61da2143 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 03:20:18 +0700 Subject: [PATCH 13/19] chore: logger --- internal/worker/supervisor.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/worker/supervisor.go b/internal/worker/supervisor.go index 81445f05..9be4aab1 100644 --- a/internal/worker/supervisor.go +++ b/internal/worker/supervisor.go @@ -62,7 +62,7 @@ func (r *WorkerSupervisor) Register(w Worker) { panic(fmt.Sprintf("worker %s already registered", w.Name())) } r.workers[w.Name()] = w - r.logger.Debug("worker registered", zap.String("worker", w.Name())) + r.logger.Info("worker registered", zap.String("worker", w.Name())) } // GetHealthTracker returns the health tracker for this supervisor. 
@@ -100,7 +100,7 @@ func (r *WorkerSupervisor) Run(ctx context.Context) error { go func(name string, w Worker) { defer wg.Done() - r.logger.Info("worker starting", zap.String("worker", name)) + r.logger.Debug("worker starting", zap.String("worker", name)) r.health.MarkHealthy(name) // Run the worker From f39d4b7621b2bb8497d088a4170c5df4703e6c86 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 03:40:00 +0700 Subject: [PATCH 14/19] chore: rename retry scheduler to retrymq consumer --- docs/apis/openapi.yaml | 4 ++-- internal/services/builder.go | 4 ++-- ...try_scheduler_worker.go => retrymq_worker.go} | 16 ++++++++-------- internal/worker/worker.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) rename internal/services/{retry_scheduler_worker.go => retrymq_worker.go} (60%) diff --git a/docs/apis/openapi.yaml b/docs/apis/openapi.yaml index 6ae97d20..dda75909 100644 --- a/docs/apis/openapi.yaml +++ b/docs/apis/openapi.yaml @@ -1740,7 +1740,7 @@ paths: workers: http-server: status: healthy - retry-scheduler: + retrymq-consumer: status: healthy "503": description: Service is unhealthy - one or more workers have failed. @@ -1779,7 +1779,7 @@ paths: workers: http-server: status: healthy - retry-scheduler: + retrymq-consumer: status: failed # Tenants /{tenant_id}: diff --git a/internal/services/builder.go b/internal/services/builder.go index a5a28a97..e7dc3915 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -236,8 +236,8 @@ func (b *ServiceBuilder) BuildAPIWorkers(baseRouter *gin.Engine) error { svc.router = baseRouter - // Worker 1: Retry Scheduler - retryWorker := NewRetrySchedulerWorker(svc.retryScheduler, b.logger) + // Worker 1: RetryMQ Consumer + retryWorker := NewRetryMQWorker(svc.retryScheduler, b.logger) b.supervisor.Register(retryWorker) // Worker 2: PublishMQ Consumer (optional) diff --git a/internal/services/retry_scheduler_worker.go b/internal/services/retrymq_worker.go similarity index 60% rename from internal/services/retry_scheduler_worker.go rename to internal/services/retrymq_worker.go index ac1515e4..010e61a5 100644 --- a/internal/services/retry_scheduler_worker.go +++ b/internal/services/retrymq_worker.go @@ -9,27 +9,27 @@ import ( "go.uber.org/zap" ) -// RetrySchedulerWorker wraps a retry scheduler as a worker. -type RetrySchedulerWorker struct { +// RetryMQWorker wraps a retry scheduler as a worker. +type RetryMQWorker struct { scheduler scheduler.Scheduler logger *logging.Logger } -// NewRetrySchedulerWorker creates a new retry scheduler worker. -func NewRetrySchedulerWorker(scheduler scheduler.Scheduler, logger *logging.Logger) worker.Worker { - return &RetrySchedulerWorker{ +// NewRetryMQWorker creates a new retry scheduler worker. +func NewRetryMQWorker(scheduler scheduler.Scheduler, logger *logging.Logger) worker.Worker { + return &RetryMQWorker{ scheduler: scheduler, logger: logger, } } // Name returns the worker name. -func (w *RetrySchedulerWorker) Name() string { - return "retry-scheduler" +func (w *RetryMQWorker) Name() string { + return "retrymq-consumer" } // Run starts the retry scheduler monitor and blocks until context is cancelled or it fails. 
-func (w *RetrySchedulerWorker) Run(ctx context.Context) error { +func (w *RetryMQWorker) Run(ctx context.Context) error { logger := w.logger.Ctx(ctx) logger.Info("retry scheduler monitor running") diff --git a/internal/worker/worker.go b/internal/worker/worker.go index 538b4a4e..0e2a955a 100644 --- a/internal/worker/worker.go +++ b/internal/worker/worker.go @@ -10,7 +10,7 @@ import "context" // - Return nil or context.Canceled for graceful shutdown // - Return non-nil error only for fatal failures type Worker interface { - // Name returns a unique identifier for this worker (e.g., "http-server", "retry-scheduler") + // Name returns a unique identifier for this worker (e.g., "http-server", "retrymq-consumer") Name() string // Run executes the worker and blocks until context is cancelled or error occurs. From 9f4acfac48c7141155bc6bdf285bcb99d04abdcb Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Tue, 11 Nov 2025 18:20:40 +0700 Subject: [PATCH 15/19] chore: README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d563b03..91ec599b 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Check the services are running: curl $OUTPOST_URL/api/v1/healthz ``` -Wait until you get a `OK%` response. +Wait until you get a 200 response. Create a tenant with the following command, replacing `$TENANT_ID` with a unique identifier such as "your_org_name", and the `$API_KEY` with the value you set in your `.env`: From cb39126d0c951e2151d6de917010b66de7a0cfab Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Thu, 13 Nov 2025 22:47:35 +0700 Subject: [PATCH 16/19] fix: remove duplicate http server shutdown --- internal/services/builder.go | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/internal/services/builder.go b/internal/services/builder.go index e7dc3915..63dcab4e 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -118,33 +118,8 @@ func (b *ServiceBuilder) createHTTPServer(router http.Handler) error { Handler: router, } - // Add cleanup for HTTP server to the first service (or API service for service=all) - serviceType := b.cfg.MustGetService() - var targetSvc *serviceInstance - if serviceType == config.ServiceTypeAll { - // For service=all, prefer API service for cleanup - for _, svc := range b.services { - if svc.name == "api" { - targetSvc = svc - break - } - } - } - if targetSvc == nil && len(b.services) > 0 { - // Fallback to first service - targetSvc = b.services[0] - } - - if targetSvc != nil { - targetSvc.cleanupFuncs = append(targetSvc.cleanupFuncs, func(ctx context.Context, logger *logging.LoggerWithCtx) { - if err := httpServer.Shutdown(ctx); err != nil { - logger.Error("error shutting down http server", zap.Error(err)) - } - logger.Info("http server shut down") - }) - } - // Register HTTP server worker + // Note: HTTP server shutdown is handled by HTTPServerWorker, not in cleanup functions httpWorker := NewHTTPServerWorker(httpServer, b.logger) b.supervisor.Register(httpWorker) From 02880eab641d902a19cdbdd49e64b574c8efbac7 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Mon, 17 Nov 2025 11:06:04 +0700 Subject: [PATCH 17/19] fix: improve context error handling in MQ workers --- internal/services/deliverymq_worker.go | 9 ++++++--- internal/services/logmq_worker.go | 9 ++++++--- internal/services/publishmq_worker.go | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/internal/services/deliverymq_worker.go 
b/internal/services/deliverymq_worker.go index a0684b25..90cf5347 100644 --- a/internal/services/deliverymq_worker.go +++ b/internal/services/deliverymq_worker.go @@ -55,9 +55,12 @@ func (w *DeliveryMQWorker) Run(ctx context.Context) error { consumer.WithConcurrency(w.concurrency), ) - if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { - logger.Error("error running deliverymq consumer", zap.Error(err)) - return err + if err := csm.Run(ctx); err != nil { + // Only report as failure if it's not a graceful shutdown + if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + logger.Error("error running deliverymq consumer", zap.Error(err)) + return err + } } return nil diff --git a/internal/services/logmq_worker.go b/internal/services/logmq_worker.go index 871b4f38..d6c351b1 100644 --- a/internal/services/logmq_worker.go +++ b/internal/services/logmq_worker.go @@ -55,9 +55,12 @@ func (w *LogMQWorker) Run(ctx context.Context) error { consumer.WithConcurrency(w.concurrency), ) - if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { - logger.Error("error running logmq consumer", zap.Error(err)) - return err + if err := csm.Run(ctx); err != nil { + // Only report as failure if it's not a graceful shutdown + if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + logger.Error("error running logmq consumer", zap.Error(err)) + return err + } } return nil diff --git a/internal/services/publishmq_worker.go b/internal/services/publishmq_worker.go index 0b77b30b..21860979 100644 --- a/internal/services/publishmq_worker.go +++ b/internal/services/publishmq_worker.go @@ -56,9 +56,12 @@ func (w *PublishMQWorker) Run(ctx context.Context) error { consumer.WithConcurrency(w.concurrency), ) - if err := csm.Run(ctx); !errors.Is(err, ctx.Err()) { - logger.Error("error running publishmq consumer", zap.Error(err)) - return err + if err := csm.Run(ctx); err != nil { + // Only report as failure if it's not a graceful shutdown + if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + logger.Error("error running publishmq consumer", zap.Error(err)) + return err + } } return nil From 2f96681a54fe9221e89e74afd3b59ad2047d25f0 Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Mon, 17 Nov 2025 11:16:45 +0700 Subject: [PATCH 18/19] refactor: generic consumer worker --- internal/services/builder.go | 18 ++++--- internal/services/consumer_worker.go | 71 ++++++++++++++++++++++++++ internal/services/deliverymq_worker.go | 67 ------------------------ internal/services/logmq_worker.go | 67 ------------------------ internal/services/publishmq_worker.go | 68 ------------------------ 5 files changed, 82 insertions(+), 209 deletions(-) create mode 100644 internal/services/consumer_worker.go delete mode 100644 internal/services/deliverymq_worker.go delete mode 100644 internal/services/logmq_worker.go delete mode 100644 internal/services/publishmq_worker.go diff --git a/internal/services/builder.go b/internal/services/builder.go index 63dcab4e..9b6eb602 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -218,9 +218,11 @@ func (b *ServiceBuilder) BuildAPIWorkers(baseRouter *gin.Engine) error { // Worker 2: PublishMQ Consumer (optional) if b.cfg.PublishMQ.GetQueueConfig() != nil { publishMQ := publishmq.New(publishmq.WithQueue(b.cfg.PublishMQ.GetQueueConfig())) - publishMQWorker := NewPublishMQWorker( - publishMQ, - eventHandler, + messageHandler := publishmq.NewMessageHandler(eventHandler) + publishMQWorker := 
NewConsumerWorker( + "publishmq-consumer", + publishMQ.Subscribe, + messageHandler, b.cfg.PublishMaxConcurrency, b.logger, ) @@ -314,8 +316,9 @@ func (b *ServiceBuilder) BuildDeliveryWorker(baseRouter *gin.Engine) error { svc.router = baseRouter // Create DeliveryMQ worker - deliveryWorker := NewDeliveryMQWorker( - svc.deliveryMQ, + deliveryWorker := NewConsumerWorker( + "deliverymq-consumer", + svc.deliveryMQ.Subscribe, handler, b.cfg.DeliveryMaxConcurrency, b.logger, @@ -378,8 +381,9 @@ func (b *ServiceBuilder) BuildLogWorker(baseRouter *gin.Engine) error { svc.router = baseRouter // Create LogMQ worker - logWorker := NewLogMQWorker( - logMQ, + logWorker := NewConsumerWorker( + "logmq-consumer", + logMQ.Subscribe, handler, b.cfg.DeliveryMaxConcurrency, b.logger, diff --git a/internal/services/consumer_worker.go b/internal/services/consumer_worker.go new file mode 100644 index 00000000..6ff3a913 --- /dev/null +++ b/internal/services/consumer_worker.go @@ -0,0 +1,71 @@ +package services + +import ( + "context" + "errors" + + "github.com/hookdeck/outpost/internal/consumer" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/mqs" + "github.com/hookdeck/outpost/internal/worker" + "go.uber.org/zap" +) + +// ConsumerWorker is a generic worker that wraps a message queue consumer. +// It handles subscription at runtime and consistent error handling for graceful shutdowns. +type ConsumerWorker struct { + name string + subscribe func(ctx context.Context) (mqs.Subscription, error) + handler consumer.MessageHandler + concurrency int + logger *logging.Logger +} + +// NewConsumerWorker creates a new generic consumer worker. +func NewConsumerWorker( + name string, + subscribe func(ctx context.Context) (mqs.Subscription, error), + handler consumer.MessageHandler, + concurrency int, + logger *logging.Logger, +) worker.Worker { + return &ConsumerWorker{ + name: name, + subscribe: subscribe, + handler: handler, + concurrency: concurrency, + logger: logger, + } +} + +// Name returns the worker name. +func (w *ConsumerWorker) Name() string { + return w.name +} + +// Run starts the consumer and blocks until context is cancelled or it fails. +func (w *ConsumerWorker) Run(ctx context.Context) error { + logger := w.logger.Ctx(ctx) + logger.Info("consumer worker starting", zap.String("name", w.name)) + + subscription, err := w.subscribe(ctx) + if err != nil { + logger.Error("error subscribing", zap.String("name", w.name), zap.Error(err)) + return err + } + + csm := consumer.New(subscription, w.handler, + consumer.WithName(w.name), + consumer.WithConcurrency(w.concurrency), + ) + + if err := csm.Run(ctx); err != nil { + // Only report as failure if it's not a graceful shutdown + if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { + logger.Error("error running consumer", zap.String("name", w.name), zap.Error(err)) + return err + } + } + + return nil +} diff --git a/internal/services/deliverymq_worker.go b/internal/services/deliverymq_worker.go deleted file mode 100644 index 90cf5347..00000000 --- a/internal/services/deliverymq_worker.go +++ /dev/null @@ -1,67 +0,0 @@ -package services - -import ( - "context" - "errors" - - "github.com/hookdeck/outpost/internal/consumer" - "github.com/hookdeck/outpost/internal/deliverymq" - "github.com/hookdeck/outpost/internal/logging" - "github.com/hookdeck/outpost/internal/worker" - "go.uber.org/zap" -) - -// DeliveryMQWorker wraps a DeliveryMQ consumer as a worker. 
-type DeliveryMQWorker struct { - deliveryMQ *deliverymq.DeliveryMQ - handler consumer.MessageHandler - concurrency int - logger *logging.Logger -} - -// NewDeliveryMQWorker creates a new DeliveryMQ consumer worker. -func NewDeliveryMQWorker( - deliveryMQ *deliverymq.DeliveryMQ, - handler consumer.MessageHandler, - concurrency int, - logger *logging.Logger, -) worker.Worker { - return &DeliveryMQWorker{ - deliveryMQ: deliveryMQ, - handler: handler, - concurrency: concurrency, - logger: logger, - } -} - -// Name returns the worker name. -func (w *DeliveryMQWorker) Name() string { - return "deliverymq-consumer" -} - -// Run starts the DeliveryMQ consumer and blocks until context is cancelled or it fails. -func (w *DeliveryMQWorker) Run(ctx context.Context) error { - logger := w.logger.Ctx(ctx) - logger.Info("deliverymq consumer running") - - subscription, err := w.deliveryMQ.Subscribe(ctx) - if err != nil { - logger.Error("error subscribing to deliverymq", zap.Error(err)) - return err - } - - csm := consumer.New(subscription, w.handler, - consumer.WithName("deliverymq"), - consumer.WithConcurrency(w.concurrency), - ) - - if err := csm.Run(ctx); err != nil { - // Only report as failure if it's not a graceful shutdown - if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { - logger.Error("error running deliverymq consumer", zap.Error(err)) - return err - } - } - - return nil -} diff --git a/internal/services/logmq_worker.go b/internal/services/logmq_worker.go deleted file mode 100644 index d6c351b1..00000000 --- a/internal/services/logmq_worker.go +++ /dev/null @@ -1,67 +0,0 @@ -package services - -import ( - "context" - "errors" - - "github.com/hookdeck/outpost/internal/consumer" - "github.com/hookdeck/outpost/internal/logging" - "github.com/hookdeck/outpost/internal/logmq" - "github.com/hookdeck/outpost/internal/worker" - "go.uber.org/zap" -) - -// LogMQWorker wraps a LogMQ consumer as a worker. -type LogMQWorker struct { - logMQ *logmq.LogMQ - handler consumer.MessageHandler - concurrency int - logger *logging.Logger -} - -// NewLogMQWorker creates a new LogMQ consumer worker. -func NewLogMQWorker( - logMQ *logmq.LogMQ, - handler consumer.MessageHandler, - concurrency int, - logger *logging.Logger, -) worker.Worker { - return &LogMQWorker{ - logMQ: logMQ, - handler: handler, - concurrency: concurrency, - logger: logger, - } -} - -// Name returns the worker name. -func (w *LogMQWorker) Name() string { - return "logmq-consumer" -} - -// Run starts the LogMQ consumer and blocks until context is cancelled or it fails. 
-func (w *LogMQWorker) Run(ctx context.Context) error { - logger := w.logger.Ctx(ctx) - logger.Info("logmq consumer running") - - subscription, err := w.logMQ.Subscribe(ctx) - if err != nil { - logger.Error("error subscribing to logmq", zap.Error(err)) - return err - } - - csm := consumer.New(subscription, w.handler, - consumer.WithName("logmq"), - consumer.WithConcurrency(w.concurrency), - ) - - if err := csm.Run(ctx); err != nil { - // Only report as failure if it's not a graceful shutdown - if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { - logger.Error("error running logmq consumer", zap.Error(err)) - return err - } - } - - return nil -} diff --git a/internal/services/publishmq_worker.go b/internal/services/publishmq_worker.go deleted file mode 100644 index 21860979..00000000 --- a/internal/services/publishmq_worker.go +++ /dev/null @@ -1,68 +0,0 @@ -package services - -import ( - "context" - "errors" - - "github.com/hookdeck/outpost/internal/consumer" - "github.com/hookdeck/outpost/internal/logging" - "github.com/hookdeck/outpost/internal/publishmq" - "github.com/hookdeck/outpost/internal/worker" - "go.uber.org/zap" -) - -// PublishMQWorker wraps a PublishMQ consumer as a worker. -type PublishMQWorker struct { - publishMQ *publishmq.PublishMQ - eventHandler publishmq.EventHandler - concurrency int - logger *logging.Logger -} - -// NewPublishMQWorker creates a new PublishMQ consumer worker. -func NewPublishMQWorker( - publishMQ *publishmq.PublishMQ, - eventHandler publishmq.EventHandler, - concurrency int, - logger *logging.Logger, -) worker.Worker { - return &PublishMQWorker{ - publishMQ: publishMQ, - eventHandler: eventHandler, - concurrency: concurrency, - logger: logger, - } -} - -// Name returns the worker name. -func (w *PublishMQWorker) Name() string { - return "publishmq-consumer" -} - -// Run starts the PublishMQ consumer and blocks until context is cancelled or it fails. -func (w *PublishMQWorker) Run(ctx context.Context) error { - logger := w.logger.Ctx(ctx) - logger.Info("publishmq consumer running") - - subscription, err := w.publishMQ.Subscribe(ctx) - if err != nil { - logger.Error("error subscribing to publishmq", zap.Error(err)) - return err - } - - messageHandler := publishmq.NewMessageHandler(w.eventHandler) - csm := consumer.New(subscription, messageHandler, - consumer.WithName("publishmq"), - consumer.WithConcurrency(w.concurrency), - ) - - if err := csm.Run(ctx); err != nil { - // Only report as failure if it's not a graceful shutdown - if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) { - logger.Error("error running publishmq consumer", zap.Error(err)) - return err - } - } - - return nil -} From e0808cd432f1f09cd894a6787fb3e30f9d0722da Mon Sep 17 00:00:00 2001 From: Alex Luong Date: Mon, 17 Nov 2025 11:20:42 +0700 Subject: [PATCH 19/19] chore: lifo cleanup --- internal/services/builder.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/services/builder.go b/internal/services/builder.go index 9b6eb602..31a98961 100644 --- a/internal/services/builder.go +++ b/internal/services/builder.go @@ -128,12 +128,18 @@ func (b *ServiceBuilder) createHTTPServer(router http.Handler) error { } // Cleanup runs all registered cleanup functions for all services. +// Cleanup is performed in LIFO (last-in-first-out) order to ensure that +// resources created later (which may depend on earlier resources) are +// cleaned up before their dependencies. 
func (b *ServiceBuilder) Cleanup(ctx context.Context) { logger := b.logger.Ctx(ctx) - for _, svc := range b.services { + // Clean up services in reverse order (LIFO) + for i := len(b.services) - 1; i >= 0; i-- { + svc := b.services[i] logger.Debug("cleaning up service", zap.String("service", svc.name)) - for _, cleanupFunc := range svc.cleanupFuncs { - cleanupFunc(ctx, &logger) + // Clean up functions within each service in reverse order + for j := len(svc.cleanupFuncs) - 1; j >= 0; j-- { + svc.cleanupFuncs[j](ctx, &logger) } } }
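
Editor's note on PATCH 19/19: the reverse-iteration Cleanup above is the standard LIFO teardown pattern. A minimal standalone sketch of the idea follows; it is not part of the diff, and the cleanupFunc type and the fake "database"/"consumer" resources are illustrative only (the real cleanup functions also receive a *logging.LoggerWithCtx).

package main

import (
	"context"
	"fmt"
)

// cleanupFunc mirrors the shape of the builder's cleanup callbacks,
// reduced to a context-only signature for this sketch.
type cleanupFunc func(ctx context.Context)

func main() {
	ctx := context.Background()
	var cleanups []cleanupFunc

	// Registered first: a resource that others depend on (e.g. a DB connection).
	cleanups = append(cleanups, func(ctx context.Context) {
		fmt.Println("closing database")
	})

	// Registered second: a resource built on top of the first (e.g. a consumer).
	cleanups = append(cleanups, func(ctx context.Context) {
		fmt.Println("stopping consumer (still needs the database)")
	})

	// LIFO: walk the slice in reverse registration order so the consumer
	// stops before the database it depends on is closed.
	for i := len(cleanups) - 1; i >= 0; i-- {
		cleanups[i](ctx)
	}
}

Running the sketch prints "stopping consumer" before "closing database", which is exactly the dependency-safe ordering the commit message describes.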