-
Notifications
You must be signed in to change notification settings - Fork 4.6k
/
rollingupdate.go
219 lines (175 loc) · 6.41 KB
/
rollingupdate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package instancegroups
import (
"context"
"fmt"
"sort"
"sync"
"time"
"k8s.io/kops/pkg/client/simple"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
api "k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/cloudinstances"
"k8s.io/kops/pkg/validation"
"k8s.io/kops/upup/pkg/fi"
)
// RollingUpdateCluster is a struct containing cluster information for a rolling update.
type RollingUpdateCluster struct {
	// Clientset is the kops API clientset.
	Clientset simple.Clientset
	// Ctx is the context used during the rolling update.
	// NOTE(review): storing a Context in a struct is discouraged by Go
	// convention (pass ctx as the first parameter instead) — candidate for
	// a future refactor.
	Ctx context.Context
	// Cluster is the cluster being rolled.
	Cluster *api.Cluster
	// Cloud is the cloud implementation for the cluster.
	Cloud fi.Cloud
	// MasterInterval is the amount of time to wait after stopping a master instance
	MasterInterval time.Duration
	// NodeInterval is the amount of time to wait after stopping a non-master instance
	NodeInterval time.Duration
	// BastionInterval is the amount of time to wait after stopping a bastion instance
	BastionInterval time.Duration
	// Interactive prompts user to continue after each instance is updated
	Interactive bool
	// Force — presumably forces instances to be updated even when not flagged
	// as needing update; confirm against the rolling-update implementation.
	Force bool
	// K8sClient is the kubernetes client, used for draining etc
	K8sClient kubernetes.Interface
	// ClusterValidator is used for validating the cluster. Unused if CloudOnly
	ClusterValidator validation.ClusterValidator
	// FailOnDrainError — presumably aborts the update when draining a node
	// fails; confirm against the per-group update code.
	FailOnDrainError bool
	// FailOnValidate — presumably aborts the update when cluster validation
	// fails; confirm against the per-group update code.
	FailOnValidate bool
	// CloudOnly skips interaction with the kubernetes API (see ClusterValidator above);
	// exact behavior is implemented outside this chunk.
	CloudOnly bool
	// ClusterName is the name of the cluster, used in the completion log message.
	ClusterName string
	// PostDrainDelay is the duration we wait after draining each node
	PostDrainDelay time.Duration
	// ValidationTimeout is the maximum time to wait for the cluster to validate, once we start validation
	ValidationTimeout time.Duration
	// ValidateTickDuration is the amount of time to wait between cluster validation attempts
	ValidateTickDuration time.Duration
	// ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully
	// before updating the next node
	ValidateSuccessDuration time.Duration
	// ValidateCount is the amount of time that a cluster needs to be validated after single node update
	ValidateCount int
}
// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
func (*RollingUpdateCluster) AdjustNeedUpdate(groups map[string]*cloudinstances.CloudInstanceGroup) error {
	// Delegate the adjustment to each cloud instance group in turn.
	for _, g := range groups {
		g.AdjustNeedUpdate()
	}
	return nil
}
// RollingUpdate performs a rolling update on a K8s Cluster.
//
// Groups are updated in a fixed order: bastions first (in parallel), then
// masters (in series), then API servers (in series), then nodes (in series).
// A failed bastion or master aborts the update immediately; API-server and
// node failures are recorded in results and reported only after every group
// has been attempted.
//
// NOTE(review): the instanceGroups parameter is not referenced anywhere in
// this function body — presumably retained for caller/interface
// compatibility; confirm before removing.
func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*cloudinstances.CloudInstanceGroup, instanceGroups *api.InstanceGroupList) error {
	if len(groups) == 0 {
		klog.Info("Cloud Instance Group length is zero. Not doing a rolling-update.")
		return nil
	}

	// results maps group key -> update outcome (nil means success).
	// resultsMutex guards results while the bastion goroutines below write to
	// it concurrently; the later phases are serial and write without locking.
	var resultsMutex sync.Mutex
	results := make(map[string]error)

	// Partition the groups by role so each role can be rolled with its own
	// ordering, parallelism, and wait interval.
	masterGroups := make(map[string]*cloudinstances.CloudInstanceGroup)
	apiServerGroups := make(map[string]*cloudinstances.CloudInstanceGroup)
	nodeGroups := make(map[string]*cloudinstances.CloudInstanceGroup)
	bastionGroups := make(map[string]*cloudinstances.CloudInstanceGroup)
	for k, group := range groups {
		switch group.InstanceGroup.Spec.Role {
		case api.InstanceGroupRoleNode:
			nodeGroups[k] = group
		case api.InstanceGroupRoleAPIServer:
			apiServerGroups[k] = group
		case api.InstanceGroupRoleMaster:
			masterGroups[k] = group
		case api.InstanceGroupRoleBastion:
			bastionGroups[k] = group
		default:
			return fmt.Errorf("unknown group type for group %q", group.InstanceGroup.ObjectMeta.Name)
		}
	}

	// Upgrade bastions first; if these go down we can't see anything
	{
		var wg sync.WaitGroup
		for _, k := range sortGroups(bastionGroups) {
			wg.Add(1)
			go func(k string) {
				// Pre-seed a sentinel error so that if this goroutine panics
				// before the real result is written, the failure is still
				// visible in results and the update stops below.
				resultsMutex.Lock()
				results[k] = fmt.Errorf("function panic bastions")
				resultsMutex.Unlock()
				defer wg.Done()
				err := c.rollingUpdateInstanceGroup(bastionGroups[k], c.BastionInterval)
				// Overwrite the sentinel with the actual outcome.
				resultsMutex.Lock()
				results[k] = err
				resultsMutex.Unlock()
			}(k)
		}
		wg.Wait()
	}

	// Do not continue update if bastion(s) failed
	for _, err := range results {
		if err != nil {
			return fmt.Errorf("bastion not healthy after update, stopping rolling-update: %q", err)
		}
	}

	// Upgrade masters next
	{
		// We run master nodes in series, even if they are in separate instance groups
		// typically they will be in separate instance groups, so we can force the zones,
		// and we don't want to roll all the masters at the same time. See issue #284
		for _, k := range sortGroups(masterGroups) {
			err := c.rollingUpdateInstanceGroup(masterGroups[k], c.MasterInterval)
			// Do not continue update if master(s) failed, cluster is potentially in an unhealthy state
			if err != nil {
				return fmt.Errorf("master not healthy after update, stopping rolling-update: %q", err)
			}
		}
	}

	// Upgrade API servers
	{
		// Pre-seed sentinel errors (overwritten per group below) so a panic
		// mid-loop still leaves a failure recorded for unprocessed groups.
		for k := range apiServerGroups {
			results[k] = fmt.Errorf("function panic apiservers")
		}
		for _, k := range sortGroups(apiServerGroups) {
			err := c.rollingUpdateInstanceGroup(apiServerGroups[k], c.NodeInterval)
			results[k] = err
			// TODO: Bail on error?
		}
	}

	// Upgrade nodes
	{
		// We run nodes in series, even if they are in separate instance groups
		// typically they will not being separate instance groups. If you roll the nodes in parallel
		// you can get into a scenario where you can evict multiple statefulset pods from the same
		// statefulset at the same time. Further improvements needs to be made to protect from this as
		// well.

		// Pre-seed sentinel errors, as in the API-server phase above.
		for k := range nodeGroups {
			results[k] = fmt.Errorf("function panic nodes")
		}
		for _, k := range sortGroups(nodeGroups) {
			err := c.rollingUpdateInstanceGroup(nodeGroups[k], c.NodeInterval)
			results[k] = err
			// TODO: Bail on error?
		}
	}

	// Surface the first recorded failure from the API-server/node phases, if
	// any (map iteration order is not deterministic, so "first" is arbitrary).
	for _, err := range results {
		if err != nil {
			return err
		}
	}

	klog.Infof("Rolling update completed for cluster %q!", c.ClusterName)
	return nil
}
// sortGroups returns the keys of groupMap in ascending lexical order,
// giving the rolling update a deterministic iteration order over groups.
func sortGroups(groupMap map[string]*cloudinstances.CloudInstanceGroup) []string {
	keys := make([]string, 0, len(groupMap))
	for name := range groupMap {
		keys = append(keys, name)
	}
	sort.Strings(keys)
	return keys
}