Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: segfault on auto-revert with namespace #473

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions levant/auto_revert.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,48 @@ package levant
import (
"time"

"github.com/hashicorp/nomad/api"
nomad "github.com/hashicorp/nomad/api"
"github.com/rs/zerolog/log"
)

func (l *levantDeployment) autoRevert(jobID, depID *string) {
func (l *levantDeployment) autoRevert(dep *nomad.Deployment) {

// Setup a loop in order to retry a race condition whereby Levant may query
// the latest deployment (auto-revert dep) before it has been started.
i := 0
for i := 0; i < 5; i++ {

dep, _, err := l.nomad.Jobs().LatestDeployment(*jobID, nil)
revertDep, _, err := l.nomad.Jobs().LatestDeployment(dep.JobID, &api.QueryOptions{Namespace: dep.Namespace})
if err != nil {
log.Error().Msgf("levant/auto_revert: unable to query latest deployment of job %s", *jobID)
log.Error().Msgf("levant/auto_revert: unable to query latest deployment of job %s", dep.JobID)
return
}

// Check whether we have got the original deployment ID as a return from
// Nomad, and if so, continue the loop to try again.
if dep.ID == *depID {
log.Debug().Msgf("levant/auto_revert: auto-revert deployment not triggered for job %s, rechecking", *jobID)
if revertDep.ID == dep.ID {
log.Debug().Msgf("levant/auto_revert: auto-revert deployment not triggered for job %s, rechecking", dep.JobID)
time.Sleep(1 * time.Second)
continue
}

log.Info().Msgf("levant/auto_revert: beginning deployment watcher for job %s", *jobID)
success := l.deploymentWatcher(dep.ID)
log.Info().Msgf("levant/auto_revert: beginning deployment watcher for job %s", dep.JobID)
success := l.deploymentWatcher(revertDep.ID)

if success {
log.Info().Msgf("levant/auto_revert: auto-revert of job %s was successful", *jobID)
log.Info().Msgf("levant/auto_revert: auto-revert of job %s was successful", dep.JobID)
break
} else {
log.Error().Msgf("levant/auto_revert: auto-revert of job %s failed; POTENTIAL OUTAGE SITUATION", *jobID)
l.checkFailedDeployment(&dep.ID)
log.Error().Msgf("levant/auto_revert: auto-revert of job %s failed; POTENTIAL OUTAGE SITUATION", dep.JobID)
l.checkFailedDeployment(&revertDep.ID)
break
}
}

// At this point we have not been able to get the latest deploymentID that
// is different from the original so we can't perform auto-revert checking.
if i == 5 {
log.Error().Msgf("levant/auto_revert: unable to check auto-revert of job %s", *jobID)
log.Error().Msgf("levant/auto_revert: unable to check auto-revert of job %s", dep.JobID)
}
}

Expand All @@ -60,6 +60,7 @@ func (l *levantDeployment) checkAutoRevert(dep *nomad.Deployment) {
for _, v := range dep.TaskGroups {
if v.AutoRevert {
revert = true
break
}
}

Expand All @@ -68,7 +69,7 @@ func (l *levantDeployment) checkAutoRevert(dep *nomad.Deployment) {
dep.JobID)

// Run the levant autoRevert function.
l.autoRevert(&dep.JobID, &dep.ID)
l.autoRevert(dep)
} else {
log.Info().Msgf("levant/auto_revert: job %v is not in auto-revert; POTENTIAL OUTAGE SITUATION", dep.JobID)
}
Expand Down
18 changes: 14 additions & 4 deletions test/acctest/acctest.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ type TestCase struct {
// Steps are ran in order stopping on failure
Steps []TestStep

// SetupFunc is called at before the steps
SetupFunc TestStateFunc

// CleanupFunc is called at the end of the TestCase
CleanupFunc TestStateFunc
}
Expand Down Expand Up @@ -42,8 +45,9 @@ type TestStateFunc func(*TestState) error

// TestState is the configuration for the TestCase
type TestState struct {
JobName string
Nomad *nomad.Client
JobName string
Namespace string
Nomad *nomad.Client
}

// Test executes a single TestCase
Expand All @@ -62,6 +66,12 @@ func Test(t *testing.T, c TestCase) {
Nomad: nomad,
}

if c.SetupFunc != nil {
if err := c.SetupFunc(state); err != nil {
t.Errorf("setup failed: %s", err)
}
}

for i, step := range c.Steps {
stepNum := i + 1

Expand Down Expand Up @@ -102,15 +112,15 @@ func Test(t *testing.T, c TestCase) {

// CleanupPurgeJob is a cleanup func to purge the TestCase job from Nomad
func CleanupPurgeJob(s *TestState) error {
_, _, err := s.Nomad.Jobs().Deregister(s.JobName, true, nil)
_, _, err := s.Nomad.Jobs().Deregister(s.JobName, true, &nomad.WriteOptions{Namespace: s.Namespace})
return err
}

// CheckDeploymentStatus is a TestStateFunc to check if the latest deployment of
// the TestCase job matches the desired status
func CheckDeploymentStatus(status string) TestStateFunc {
return func(s *TestState) error {
deploy, _, err := s.Nomad.Jobs().LatestDeployment(s.JobName, nil)
deploy, _, err := s.Nomad.Jobs().LatestDeployment(s.JobName, &nomad.QueryOptions{Namespace: s.Namespace})
if err != nil {
return err
}
Expand Down
4 changes: 4 additions & 0 deletions test/acctest/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ func (c DeployTestStepRunner) Run(s *TestState) error {
return fmt.Errorf("error rendering template: %s", err)
}

if job.Namespace != nil {
s.Namespace = *job.Namespace
}

cfg := &levant.DeployConfig{
Deploy: &structs.DeployConfig{
Canary: c.Canary,
Expand Down
31 changes: 31 additions & 0 deletions test/deploy_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package test

import (
"fmt"
"testing"

"github.com/hashicorp/levant/test/acctest"
"github.com/hashicorp/nomad/api"
)

func TestDeploy_basic(t *testing.T) {
Expand All @@ -20,6 +22,35 @@ func TestDeploy_basic(t *testing.T) {
})
}

func TestDeploy_namespace(t *testing.T) {
acctest.Test(t, acctest.TestCase{
SetupFunc: func(s *acctest.TestState) error {
if _, err := s.Nomad.Namespaces().Register(&api.Namespace{Name: "test"}, nil); err != nil {
return fmt.Errorf("could not create test namespace: %w", err)
}
return nil
},
Steps: []acctest.TestStep{
{
Runner: acctest.DeployTestStepRunner{
FixtureName: "deploy_namespace.nomad",
},
Check: acctest.CheckDeploymentStatus("successful"),
},
},
CleanupFunc: func(s *acctest.TestState) error {
if err := acctest.CleanupPurgeJob(s); err != nil {
return err
}

if _, err := s.Nomad.Namespaces().Delete("test", nil); err != nil {
return fmt.Errorf("could not delete namespace test: %w", err)
}
return nil
},
})
}

func TestDeploy_driverError(t *testing.T) {
acctest.Test(t, acctest.TestCase{
Steps: []acctest.TestStep{
Expand Down
43 changes: 43 additions & 0 deletions test/fixtures/deploy_namespace.nomad
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# tests a healthy deployment

job "[[.job_name]]" {
datacenters = ["dc1"]
type = "service"
namespace = "test"
update {
max_parallel = 1
min_healthy_time = "10s"
healthy_deadline = "1m"
auto_revert = true
}

group "test" {
count = 1
restart {
attempts = 10
interval = "5m"
delay = "25s"
mode = "delay"
}
ephemeral_disk {
size = 300
}
task "alpine" {
driver = "docker"
config {
image = "alpine"
command = "tail"
args = [
"-f", "/dev/null"
]
}
resources {
cpu = 100
memory = 128
network {
mbits = 10
}
}
}
}
}