package kubernetes

import (
	"context"
	"fmt"
	"time"

	errors2 "emperror.dev/errors"
	"github.com/apex/log"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"

	"github.com/kubectyl/kuber/config"
	"github.com/kubectyl/kuber/environment"
	"github.com/kubectyl/kuber/remote"
)

// OnBeforeStart runs before the container starts and gets the process
// configuration from the Panel. This is important since we use it to check
// configuration files as well as to ensure we always have the latest version
// of a rocket available for server processes.
//
// This process also confirms that the server environment exists and is in a
// bootable state. This ensures that an unexpected pod deletion while Kuber is
// running does not leave the server unbootable.
func (e *Environment) OnBeforeStart(ctx context.Context) error {
// Always destroy and re-create the server container to ensure that synced data from the Panel is used.
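	// A zero grace period deletes the pod immediately, and the foreground
	// propagation policy makes the delete wait on the pod's dependents as well.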
var zero int64 = 0
policy := metav1.DeletePropagationForeground
if err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Delete(ctx, e.Id, metav1.DeleteOptions{GracePeriodSeconds: &zero, PropagationPolicy: &policy}); err != nil {
if !errors.IsNotFound(err) {
return errors2.WrapIf(err, "environment/kubernetes: failed to remove pod during pre-boot")
}
}
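	// Poll every second, for up to ten seconds, until the pod is confirmed
	// gone. The condition returns (true, nil) on NotFound to end the poll
	// successfully, and (true, err) to abort it on any other error.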
err := wait.Poll(time.Second, time.Second*10, func() (bool, error) {
_, err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Get(context.TODO(), e.Id, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return true, nil
}
return true, err
}
return false, nil
})
if err != nil {
return err
}
// The Create() function will check if the container exists in the first place, and if
// so just silently return without an error. Otherwise, it will try to create the necessary
// container and data storage directory.
//
// This won't actually run an installation process however, it is just here to ensure the
// environment gets created properly if it is missing and the server is started. We're making
// an assumption that all the files will still exist at this point.
if err := e.Create(); err != nil {
return err
}
return nil
}

// Start will start the server environment and begin piping output to the
// event listeners for the console. If a container does not exist, or needs
// to be rebuilt, that will happen in the call to OnBeforeStart().
func (e *Environment) Start(ctx context.Context) error {
sawError := false
	// If sawError is set to true, there was an error somewhere in the pipeline
	// that got passed up, and we also want to ensure we set the server to be
	// offline at that point.
defer func() {
if sawError {
			// If we don't set it to stopping first, we'll trigger crash detection,
			// which we don't want at this point since it would just immediately
			// retry the exact same action that led to the crash in the first place...
e.SetState(environment.ProcessStoppingState)
e.SetState(environment.ProcessOfflineState)
}
}()
if c, err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Get(ctx, e.Id, metav1.GetOptions{}); err != nil {
		// Do nothing if the container is not found; we just don't want to continue
		// to the next block of code here. This check was inlined to guard against
		// a nil pointer when checking c.Status below.
if !errors.IsNotFound(err) {
return errors2.WrapIf(err, "environment/kubernetes: failed to inspect pod")
}
} else {
		// If the server is running, update our internal state and continue on
		// with the attach.
if running, _ := e.IsRunning(ctx); running && c.Status.Phase == v1.PodRunning {
e.SetState(environment.ProcessRunningState)
ctx, cancel := context.WithCancel(context.Background())
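			// Watch the pod in the background while we remain attached.
			// PollInfinite re-checks every second until the condition returns
			// true or errors; once IsRunning reports false we cancel the attach
			// context, and any error marks the process offline.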
go func(ctx context.Context) {
				err := wait.PollInfinite(time.Second, func() (bool, error) {
					// Assign IsRunning's results to locals so err is in scope for
					// both return paths; referencing the outer short variable
					// declaration from within its own initializer would not compile.
					running, err := e.IsRunning(ctx)
					if !running {
						cancel()
						return true, err
					}
					return false, err
				})
if err != nil {
e.SetState(environment.ProcessOfflineState)
}
}(ctx)
return e.Attach(ctx)
}
}
e.SetState(environment.ProcessStartingState)
	// Set this to true for now; we will set it to false once we reach the
	// end of this chain.
sawError = true
// Run the before start function and wait for it to finish. This will validate that the container
// exists on the system, and rebuild the container if that is required for server booting to
// occur.
if err := e.OnBeforeStart(ctx); err != nil {
return errors2.WithStackIf(err)
}
// If we cannot start & attach to the container in 30 seconds something has gone
// quite sideways, and we should stop trying to avoid a hanging situation.
actx, cancel := context.WithTimeout(ctx, time.Second*30)
defer cancel()
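	// PollImmediate runs the first check right away rather than waiting for
	// the first one-second tick, then keeps polling for up to ten minutes
	// while the pod works through scheduling and image pulls.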
err := wait.PollImmediate(time.Second, 10*time.Minute, func() (bool, error) {
pod, err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Get(context.Background(), e.Id, metav1.GetOptions{})
if err != nil {
return false, err
}
switch pod.Status.Phase {
case v1.PodPending:
e.SetState(environment.ProcessStartingState)
case v1.PodRunning:
return true, nil
case v1.PodFailed, v1.PodSucceeded:
return false, fmt.Errorf("pod ran to completion")
default:
return false, fmt.Errorf("unknown pod status")
}
return false, nil
})
if err != nil {
e.SetState(environment.ProcessOfflineState)
return e.Terminate(context.Background())
}
// You must attach to the instance _before_ you start the container. If you do this
// in the opposite order you'll enter a deadlock condition where we're attached to
// the instance successfully, but the container has already stopped and you'll get
// the entire program into a very confusing state.
//
// By explicitly attaching to the instance before we start it, we can immediately
// react to errors/output stopping/etc. when starting.
if err := e.Attach(actx); err != nil {
return err
}
// No errors, good to continue through.
sawError = false
return nil
}

// Stop stops the container that the server is running in. This will allow up
// to 30 seconds to pass before the container is forcefully terminated if we
// are trying to stop it without using a command sent into the instance.
//
// You most likely want to be using WaitForStop() rather than this function,
// since this will return as soon as the command is sent, rather than waiting
// for the process to be completely stopped.
func (e *Environment) Stop(ctx context.Context) error {
e.mu.RLock()
s := e.meta.Stop
e.mu.RUnlock()
// A native "stop" as the Type field value will just skip over all of this
// logic and end up only executing the container stop command (which may or
// may not work as expected).
if s.Type == "" || s.Type == remote.ProcessStopSignal {
if s.Type == "" {
log.WithField("pod_name", e.Id).Warn("no stop configuration detected for environment, using termination procedure")
}
return e.Terminate(ctx)
}
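	// waitPoll blocks until IsRunning reports the pod as stopped, then clears
	// the stream and marks the process offline. PollUntilWithContext re-checks
	// every second until the condition returns true or the context is canceled.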
waitPoll := func(ctx context.Context) error {
err := wait.PollUntilWithContext(ctx, time.Second, func(ctx context.Context) (done bool, err error) {
running, err := e.IsRunning(ctx)
if err != nil && !errors.IsNotFound(err) {
return false, err
}
if !running {
e.SetStream(nil)
e.SetState(environment.ProcessOfflineState)
return true, nil
}
return false, nil
})
if err != nil {
e.log().WithField("error", err).Warn("error while waiting for pod stop")
}
return nil
}
// Only attempt to send the stop command to the instance if we are actually attached to
// the instance. If we are not for some reason, just send the container stop event.
if e.IsAttached() &&
s.Type == remote.ProcessStopCommand &&
e.st.Load() == environment.ProcessRunningState {
e.SendCommand(s.Value)
if err := waitPoll(ctx); err != nil {
return err
}
}
// If the process is already offline don't switch it back to stopping. Just leave it how
// it is and continue through to the stop handling for the process.
if e.st.Load() != environment.ProcessOfflineState {
e.SetState(environment.ProcessStoppingState)
}
	// Allow the stop action to run for however long it takes, similar to
	// executing a command and using a different logic pathway to wait for the
	// container to stop successfully.
	//
	// Deleting the pod without an explicit GracePeriodSeconds lets Kubernetes
	// fall back to the pod's own termination grace period, giving the process
	// a chance to shut down cleanly before it is killed.
if err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Delete(ctx, e.Id, metav1.DeleteOptions{}); err != nil {
// If the pod does not exist just mark the process as stopped and return without an error.
if errors.IsNotFound(err) {
e.SetStream(nil)
e.SetState(environment.ProcessOfflineState)
return nil
}
return errors2.Wrap(err, "environment/kubernetes: cannot stop pod")
}
if err := waitPoll(ctx); err != nil {
return err
}
return nil
}

// WaitForStop attempts to gracefully stop a server using the defined stop
// command. If the server has not stopped once the given duration has passed,
// an error will be returned, or the instance will be terminated forcefully,
// depending on the value of the terminate argument.
//
// Calls to Environment.Terminate() in this function use the context passed
// through since we don't want to prevent termination of the server instance
// just because the context.WithTimeout() deadline has expired.
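//
// A typical caller gives the server a fixed window before falling back to a
// forceful termination; a minimal sketch (hypothetical caller, shown for
// illustration only):
//
//	if err := env.WaitForStop(ctx, time.Second*30, true); err != nil {
//		log.WithField("error", err).Error("failed to stop server environment")
//	}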
func (e *Environment) WaitForStop(ctx context.Context, duration time.Duration, terminate bool) error {
tctx, cancel := context.WithTimeout(context.Background(), duration)
defer cancel()
// If the parent context is canceled, abort the timed context for termination.
go func() {
select {
case <-ctx.Done():
cancel()
case <-tctx.Done():
// When the timed context is canceled, terminate this routine since we no longer
// need to worry about the parent routine being canceled.
break
}
}()
doTermination := func(s string) error {
e.log().WithField("step", s).WithField("duration", duration).Warn("container stop did not complete in time, terminating process...")
return e.Terminate(ctx)
}
	// We pass the timed context through for this stop action so that if one of
	// the internal Kubernetes calls never finishes before we've exhausted the
	// time limit, the resources get cleaned up and the execution is stopped.
if err := e.Stop(tctx); err != nil {
if terminate && errors2.Is(err, context.DeadlineExceeded) {
return doTermination("stop")
}
return err
}
	// Block the return of this function until the pod has been marked as no
	// longer running. If this wait does not end before the timeout expires,
	// attempt to terminate the pod, or return an error.
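	// As in the pre-boot check, NotFound is the success case here: it means
	// the pod is finally gone.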
err := wait.PollUntilWithContext(tctx, time.Second, func(context.Context) (bool, error) {
_, err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Get(tctx, e.Id, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return true, nil
}
return true, err
}
return false, nil
})
if terminate && err != nil {
if !errors2.Is(err, context.DeadlineExceeded) {
e.log().WithField("error", err).Warn("error while waiting for pod stop; terminating process")
}
return doTermination("wait")
}
return nil
}

// Terminate forcefully terminates the pod by deleting it with a zero grace
// period.
func (e *Environment) Terminate(ctx context.Context) error {
_, err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Get(ctx, e.Id, metav1.GetOptions{})
if err != nil {
		// Treat a missing pod as an okay error state, since it means the pod has
		// obviously already been terminated at this point.
if errors.IsNotFound(err) {
return nil
}
return errors2.WithStack(err)
}
	// We set it to stopping, then offline, to prevent crash detection from
	// being triggered.
e.SetState(environment.ProcessStoppingState)
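	// Delete immediately: a zero grace period skips the graceful shutdown
	// window, and foreground propagation takes the pod's dependents with it.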
var zero int64 = 0
policy := metav1.DeletePropagationForeground
if err := e.client.CoreV1().Pods(config.Get().Cluster.Namespace).Delete(ctx, e.Id, metav1.DeleteOptions{
GracePeriodSeconds: &zero,
PropagationPolicy: &policy,
}); err != nil && !errors.IsNotFound(err) {
return errors2.WithStack(err)
}
e.SetState(environment.ProcessOfflineState)
return nil
}