forked from hashicorp/nomad
-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.go
169 lines (148 loc) · 4.52 KB
/
script.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package consul
import (
"context"
"log"
"time"
metrics "github.com/armon/go-metrics"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/driver"
"github.com/hashicorp/nomad/nomad/structs"
)
// heartbeater is the subset of consul agent functionality needed by script
// checks to heartbeat
type heartbeater interface {
UpdateTTL(id, output, status string) error
}
// scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and
// waiting for it to shutdown.
type scriptHandle struct {
// cancel the script
cancel func()
exitCh chan struct{}
}
// wait returns a chan that's closed when the script exits
func (s *scriptHandle) wait() <-chan struct{} {
return s.exitCh
}
// scriptCheck runs script checks via a ScriptExecutor and updates the
// appropriate check's TTL when the script succeeds.
type scriptCheck struct {
allocID string
taskName string
id string
check *structs.ServiceCheck
exec driver.ScriptExecutor
agent heartbeater
// lastCheckOk is true if the last check was ok; otherwise false
lastCheckOk bool
logger *log.Logger
shutdownCh <-chan struct{}
}
// newScriptCheck creates a new scriptCheck. run() should be called once the
// initial check is registered with Consul.
func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck,
exec driver.ScriptExecutor, agent heartbeater, logger *log.Logger,
shutdownCh <-chan struct{}) *scriptCheck {
return &scriptCheck{
allocID: allocID,
taskName: taskName,
id: checkID,
check: check,
exec: exec,
agent: agent,
lastCheckOk: true, // start logging on first failure
logger: logger,
shutdownCh: shutdownCh,
}
}
// run this script check and return its cancel func. If the shutdownCh is
// closed the check will be run once more before exiting.
func (s *scriptCheck) run() *scriptHandle {
ctx, cancel := context.WithCancel(context.Background())
exitCh := make(chan struct{})
go func() {
defer close(exitCh)
timer := time.NewTimer(0)
defer timer.Stop()
for {
// Block until check is removed, Nomad is shutting
// down, or the check interval is up
select {
case <-ctx.Done():
// check has been removed
return
case <-s.shutdownCh:
// unblock but don't exit until after we heartbeat once more
case <-timer.C:
timer.Reset(s.check.Interval)
}
metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1)
// Execute check script with timeout
execctx, cancel := context.WithTimeout(ctx, s.check.Timeout)
output, code, err := s.exec.Exec(execctx, s.check.Command, s.check.Args)
switch execctx.Err() {
case context.Canceled:
// check removed during execution; exit
cancel()
return
case context.DeadlineExceeded:
metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1)
// If no error was returned, set one to make sure the task goes critical
if err == nil {
err = context.DeadlineExceeded
}
// Log deadline exceeded every time as it's a
// distinct issue from checks returning
// failures
s.logger.Printf("[WARN] consul.checks: check %q for task %q alloc %q timed out (%s)",
s.check.Name, s.taskName, s.allocID, s.check.Timeout)
}
// cleanup context
cancel()
state := api.HealthCritical
switch code {
case 0:
state = api.HealthPassing
case 1:
state = api.HealthWarning
}
var outputMsg string
if err != nil {
state = api.HealthCritical
outputMsg = err.Error()
} else {
outputMsg = string(output)
}
// Actually heartbeat the check
err = s.agent.UpdateTTL(s.id, outputMsg, state)
select {
case <-ctx.Done():
// check has been removed; don't report errors
return
default:
}
if err != nil {
if s.lastCheckOk {
s.lastCheckOk = false
s.logger.Printf("[WARN] consul.checks: update for task %q alloc %q check %q failed: %v",
s.taskName, s.allocID, s.check.Name, err)
} else {
s.logger.Printf("[DEBUG] consul.checks: update for task %q alloc %q check %q still failing: %v",
s.taskName, s.allocID, s.check.Name, err)
}
} else if !s.lastCheckOk {
// Succeeded for the first time or after failing; log
s.lastCheckOk = true
s.logger.Printf("[INFO] consul.checks: update for task %q alloc %q check %q succeeded",
s.taskName, s.allocID, s.check.Name)
}
select {
case <-s.shutdownCh:
// We've been told to exit and just heartbeated so exit
return
default:
}
}
}()
return &scriptHandle{cancel: cancel, exitCh: exitCh}
}