From 73ffdb37b5e1ed9f6b953c853723e1f5dadc60f9 Mon Sep 17 00:00:00 2001
From: Dmitrii Okunev
Date: Tue, 4 Jul 2023 15:57:43 +0100
Subject: [PATCH] [jobmanager] Recover from job panics

Signed-off-by: Dmitrii Okunev
---
 pkg/jobmanager/jobmanager.go | 4 ++++
 pkg/jobmanager/start.go      | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go
index 6cd22310..7ceead76 100644
--- a/pkg/jobmanager/jobmanager.go
+++ b/pkg/jobmanager/jobmanager.go
@@ -14,6 +14,7 @@ import (
 	"time"
 
 	"github.com/facebookincubator/go-belt/beltctx"
+	"github.com/facebookincubator/go-belt/tool/experimental/errmon"
 	"github.com/insomniacslk/xjson"
 
 	"github.com/linuxboot/contest/pkg/api"
@@ -183,6 +184,9 @@ loop:
 			logging.Debugf(ev.Context, "Handling event %+v", ev)
 			handlerWg.Add(1)
 			go func() {
+				defer func() {
+					errmon.ObserveRecoverCtx(ev.Context, recover())
+				}()
 				defer handlerWg.Done()
 				jm.handleEvent(ev)
 			}()
diff --git a/pkg/jobmanager/start.go b/pkg/jobmanager/start.go
index 238580ff..097561fe 100644
--- a/pkg/jobmanager/start.go
+++ b/pkg/jobmanager/start.go
@@ -102,6 +102,9 @@ func (jm *JobManager) startJob(ctx context.Context, j *job.Job, resumeState *job
 }
 
 func (jm *JobManager) runJob(ctx context.Context, j *job.Job, resumeState *job.PauseEventPayload) {
+	defer func() {
+		errmon.ObserveRecoverCtx(ctx, recover())
+	}()
 	defer func() {
 		jm.jobsMu.Lock()
 		delete(jm.jobs, j.ID)