Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support marking nodes for maintenance #15

Merged
merged 6 commits into from
Nov 24, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ coverage.out
.godeps/
build/
release/
containerbuddy
examples/*/opt/containerbuddy/containerbuddy
*.tar.gz
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ Other fields:

*Note that if you're using `curl` to check HTTP endpoints for health checks, that it doesn't return a non-zero exit code on 404s or similar failure modes by default. Use the `--fail` flag for curl if you need to catch those cases.*

### Operating Containerbuddy

Containerbuddy responds to POSIX signals to change its runtime behavior. It currently handles the following signals:
- `SIGUSR1` will cause Containerbuddy to mark its advertised service for maintenance. Containerbuddy will stop sending heartbeat messages to the discovery service. The discovery service backend's `MarkForMaintenance` method will also be called (in the default Consul implementation, this deregisters the node from Consul).

The easiest way to deliver a signal to Containerbuddy is via `docker exec`, relying on the fact that Containerbuddy runs as PID 1 inside the container.

```bash
docker exec myapp_1 kill -USR1 1

```


### Contributing

Please report any issues you encounter with Containerbuddy or its documentation by [opening a Github issue](https://github.com/joyent/containerbuddy/issues). Roadmap items will be maintained as [enhancements](https://github.com/joyent/containerbuddy/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement). PRs are welcome on any issue.
Expand Down
8 changes: 6 additions & 2 deletions src/containerbuddy/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,12 @@ func (b *BackendConfig) CheckForUpstreamChanges() bool {
// PollTime returns the poll interval configured for this service
// (the `Poll` field from the loaded config).
func (s ServiceConfig) PollTime() int {
	return s.Poll
}
func (s *ServiceConfig) WriteHealthCheck() {
s.discoveryService.WriteHealthCheck(s)
func (s *ServiceConfig) SendHeartbeat() {
s.discoveryService.SendHeartbeat(s)
}

// MarkForMaintenance delegates to the discovery service backend to
// flag this service as down for maintenance (the default Consul
// backend deregisters the node).
func (s *ServiceConfig) MarkForMaintenance() {
	s.discoveryService.MarkForMaintenance(s)
}

func loadConfig() (*Config, error) {
Expand Down
11 changes: 9 additions & 2 deletions src/containerbuddy/consul.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,17 @@ func NewConsulConfig(uri string) Consul {
return *config
}

// WriteHealthCheck writes a TTL check status=ok to the consul store.
// MarkForMaintenance removes the node from Consul by deregistering
// the service with the local agent. Failures are logged but not
// fatal: the process keeps running in maintenance mode regardless.
func (c Consul) MarkForMaintenance(service *ServiceConfig) {
	err := c.Agent().ServiceDeregister(service.Id)
	if err != nil {
		log.Printf("Deregistering failed: %s\n", err)
	}
}

// SendHeartbeat writes a TTL check status=ok to the consul store.
// If consul has never seen this service, we register the service and
// its TTL check.
func (c Consul) WriteHealthCheck(service *ServiceConfig) {
func (c Consul) SendHeartbeat(service *ServiceConfig) {
if err := c.Agent().PassTTL(service.Id, "ok"); err != nil {
log.Printf("%v\nService not registered, registering...", err)
if err = c.registerService(*service); err != nil {
Expand Down
4 changes: 2 additions & 2 deletions src/containerbuddy/discovery.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package main

// DiscoveryService is the interface to a service-discovery backend
// (the default implementation in this codebase is Consul).
type DiscoveryService interface {
	// SendHeartbeat reports the given service as healthy.
	SendHeartbeat(*ServiceConfig)
	// CheckForUpstreamChanges returns true when the backend's view of
	// the upstream service has changed since the last check.
	CheckForUpstreamChanges(*BackendConfig) bool
	// MarkForMaintenance flags the service as down for maintenance
	// (the Consul implementation deregisters it).
	MarkForMaintenance(*ServiceConfig)
}

10 changes: 5 additions & 5 deletions src/containerbuddy/discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ func TestTTLPass(t *testing.T) {
consul := service.discoveryService.(Consul)
id := service.Id

service.WriteHealthCheck() // force registration
service.SendHeartbeat() // force registration
checks, _ := consul.Agent().Checks()
check := checks[id]
if check.Status != "critical" {
t.Fatalf("status of check %s should be 'critical' but is %s", id, check.Status)
}

service.WriteHealthCheck() // write TTL and verify
service.SendHeartbeat() // write TTL and verify
checks, _ = consul.Agent().Checks()
check = checks[id]
if check.Status != "passing" {
Expand All @@ -58,8 +58,8 @@ func TestCheckForChanges(t *testing.T) {
if consul.checkHealth(*backend) {
t.Fatalf("First read of %s should show `false` for change", id)
}
service.WriteHealthCheck() // force registration
service.WriteHealthCheck() // write TTL
service.SendHeartbeat() // force registration
service.SendHeartbeat() // write TTL

if !consul.checkHealth(*backend) {
t.Errorf("%v should have changed after first health check TTL", id)
Expand All @@ -71,7 +71,7 @@ func TestCheckForChanges(t *testing.T) {
if !consul.checkHealth(*backend) {
t.Errorf("%v should have changed after TTL expired.", id)
}
service.WriteHealthCheck() // re-write TTL
service.SendHeartbeat() // re-write TTL

// switch to top-level caller to make sure we have test coverage there
if !backend.CheckForUpstreamChanges() {
Expand Down
10 changes: 8 additions & 2 deletions src/containerbuddy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
)

func main() {

config, configErr := loadConfig()
if configErr != nil {
log.Fatal(configErr)
Expand All @@ -24,6 +25,9 @@ func main() {
}
}

// Set up signal handler for placing instance into maintenance mode
handleSignals(config)

var quit []chan bool
for _, backend := range config.Backends {
quit = append(quit, poll(backend, checkForChanges, backend.onChangeArgs))
Expand Down Expand Up @@ -67,7 +71,9 @@ func poll(config Pollable, fn pollingFunc, args []string) chan bool {
for {
select {
case <-ticker.C:
fn(config, args)
if !inMaintenanceMode() {
fn(config, args)
}
case <-quit:
return
}
Expand All @@ -82,7 +88,7 @@ func poll(config Pollable, fn pollingFunc, args []string) chan bool {
// checkHealth runs the service's health-check command and, when it
// exits with status 0, sends a heartbeat to the discovery service.
// A non-zero exit simply skips the heartbeat so the TTL can lapse.
func checkHealth(pollable Pollable, args []string) {
	service := pollable.(*ServiceConfig) // if we pass a bad type here we crash intentionally
	if code, _ := run(args); code == 0 {
		service.SendHeartbeat()
	}
}

Expand Down
50 changes: 50 additions & 0 deletions src/containerbuddy/signals.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package main

import (
"log"
"os"
"os/signal"
"sync"
"syscall"
)

// Package-level maintenance-mode state. All access goes through the
// RWMutex below so toggles and reads never race.
var (
	paused    bool
	pauseLock sync.RWMutex
)

// inMaintenanceMode reports whether heartbeating is currently paused.
// It takes the read side of the lock so that a read racing a toggle
// still observes a consistent value rather than a stale one.
func inMaintenanceMode() bool {
	pauseLock.RLock()
	isPaused := paused
	pauseLock.RUnlock()
	return isPaused
}

// toggleMaintenanceMode flips the maintenance flag under the write
// lock.
func toggleMaintenanceMode() {
	pauseLock.Lock()
	paused = !paused
	pauseLock.Unlock()
}

// handleSignals registers a handler that listens for SIGUSR1 and, on
// each delivery, toggles maintenance mode. When entering maintenance
// mode it additionally marks every advertised service for maintenance
// with the discovery backend. The handler goroutine runs for the life
// of the process.
func handleSignals(config *Config) {
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, syscall.SIGUSR1)
	go func() {
		// NOTE: the loop variable is named `s`, not `signal`, so it
		// does not shadow the os/signal package.
		for s := range sig {
			switch s {
			// there's only one handler today but this makes it obvious
			// where to add support for new signals
			case syscall.SIGUSR1:
				toggleMaintenanceMode()
				if inMaintenanceMode() {
					log.Println("we are paused!")
					for _, service := range config.Services {
						log.Printf("Marking for maintenance: %s\n", service.Name)
						service.MarkForMaintenance()
					}
				}
			}
		}
	}()
}
44 changes: 44 additions & 0 deletions src/containerbuddy/signals_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package main

import (
"os"
"os/signal"
"runtime"
"syscall"
"testing"
)

func TestMaintenanceSignal(t *testing.T) {

if inMaintenanceMode() {
t.Errorf("Should not be in maintenance mode before starting handler")
}
handleSignals(&Config{})
if inMaintenanceMode() {
t.Errorf("Should not be in maintenance mode after starting handler")
}

sendAndWaitForSignal(t, syscall.SIGUSR1)
if !inMaintenanceMode() {
t.Errorf("Should be in maintenance mode after receiving SIGUSR1")
}
sendAndWaitForSignal(t, syscall.SIGUSR1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for testing how to unset maintenance mode.

if inMaintenanceMode() {
t.Errorf("Should not be in maintenance mode after receiving second SIGUSR1")
}
}

// sendAndWaitForSignal delivers signal s to the current process,
// blocks until the OS has delivered it back to us, and then yields
// execution so that the handler goroutine gets a chance to run.
// If we don't do this there's a race where we can check the
// resulting side-effects of a handler before it's been run.
func sendAndWaitForSignal(t *testing.T, s os.Signal) {
	// register our own channel for SIGUSR1 before sending, so we can
	// observe delivery independently of the handler under test
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, syscall.SIGUSR1)
	// FindProcess can't fail for our own pid on Unix, so the error is
	// safe to ignore here
	me, _ := os.FindProcess(os.Getpid())
	if err := me.Signal(s); err != nil {
		t.Errorf("Got error on SIGUSR1: %v", err)
	}
	<-sig
	// NOTE(review): Gosched yields once but does not guarantee the
	// handler goroutine has finished processing — this can be flaky
	// on loaded machines; confirm in CI.
	runtime.Gosched()
}