/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package prometheus

import (
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"sync"
	"time"

	"golang.org/x/sync/errgroup"

	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"

	"k8s.io/perf-tests/clusterloader2/pkg/config"
	clerrors "k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/flags"
	"k8s.io/perf-tests/clusterloader2/pkg/framework"
	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	namespace                    = "monitoring"
	storageClass                 = "ssd"
	checkPrometheusReadyInterval = 30 * time.Second
	numK8sClients                = 1

	// All paths here are relative to the manifests dir.
	coreManifests                = "*.yaml"
	defaultServiceMonitors       = "default/*.yaml"
	kubeStateMetricsManifests    = "exporters/kube-state-metrics/*.yaml"
	masterIPServiceMonitors      = "master-ip/*.yaml"
	metricsServerManifests       = "exporters/metrics-server/*.yaml"
	nodeExporterPod              = "exporters/node_exporter/node-exporter.yaml"
	windowsNodeExporterManifests = "exporters/windows_node_exporter/*.yaml"
	pushgatewayManifests         = "pushgateway/*.yaml"
)

//go:embed manifests
var manifestsFSWithPrefix embed.FS
var manifestsFS fs.FS

func init() {
	var err error
	// Go's embed generates an embed.FS in which all files carry the 'manifests/' prefix.
	// To be consistent with --prometheus-manifest-path (whose paths are defined relative to
	// the manifests dir) we need to strip this prefix.
	manifestsFS, err = fs.Sub(manifestsFSWithPrefix, "manifests")
	if err != nil {
		panic(fmt.Sprintf("failed to strip manifests prefix: %v", err))
	}
}
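
// Illustration (a sketch, not part of the original file): after the fs.Sub call
// above, embedded files are addressed without the "manifests/" prefix. The file
// name below is hypothetical:
//
//	f, err := manifestsFS.Open("prometheus-service.yaml") // not "manifests/prometheus-service.yaml"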

// InitFlags initializes prometheus flags.
func InitFlags(p *config.PrometheusConfig) {
	flags.BoolEnvVar(&p.EnableServer, "enable-prometheus-server", "ENABLE_PROMETHEUS_SERVER", false, "Whether to set up the prometheus server in the cluster.")
	flags.BoolEnvVar(&p.TearDownServer, "tear-down-prometheus-server", "TEAR_DOWN_PROMETHEUS_SERVER", true, "Whether to tear down the prometheus server after tests (if set up).")
	flags.BoolEnvVar(&p.EnablePushgateway, "enable-pushgateway", "PROMETHEUS_ENABLE_PUSHGATEWAY", false, "Whether to set up the Pushgateway. Only works when the Prometheus server is enabled.")
	flags.BoolEnvVar(&p.ScrapeEtcd, "prometheus-scrape-etcd", "PROMETHEUS_SCRAPE_ETCD", false, "Whether to scrape etcd metrics.")
	flags.BoolEnvVar(&p.ScrapeNodeExporter, "prometheus-scrape-node-exporter", "PROMETHEUS_SCRAPE_NODE_EXPORTER", false, "Whether to scrape node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeWindowsNodeExporter, "prometheus-scrape-windows-node-exporter", "PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER", false, "Whether to scrape Windows node exporter metrics.")
	flags.BoolEnvVar(&p.ScrapeKubelets, "prometheus-scrape-kubelets", "PROMETHEUS_SCRAPE_KUBELETS", false, "Whether to scrape kubelets (nodes + master). Experimental, may not work in larger clusters. Requires the heapster node to be at least n1-standard-4, which needs to be provided manually.")
	flags.BoolEnvVar(&p.ScrapeMasterKubelets, "prometheus-scrape-master-kubelets", "PROMETHEUS_SCRAPE_MASTER_KUBELETS", false, "Whether to scrape kubelets running on master nodes.")
	flags.BoolEnvVar(&p.ScrapeKubeProxy, "prometheus-scrape-kube-proxy", "PROMETHEUS_SCRAPE_KUBE_PROXY", true, "Whether to scrape kube-proxy.")
	flags.StringEnvVar(&p.KubeProxySelectorKey, "prometheus-kube-proxy-selector-key", "PROMETHEUS_KUBE_PROXY_SELECTOR_KEY", "component", "Label key used to scrape kube-proxy.")
	flags.BoolEnvVar(&p.ScrapeKubeStateMetrics, "prometheus-scrape-kube-state-metrics", "PROMETHEUS_SCRAPE_KUBE_STATE_METRICS", false, "Whether to scrape kube-state-metrics. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeMetricsServerMetrics, "prometheus-scrape-metrics-server", "PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS", false, "Whether to scrape metrics-server. Only run occasionally.")
	flags.BoolEnvVar(&p.ScrapeNodeLocalDNS, "prometheus-scrape-node-local-dns", "PROMETHEUS_SCRAPE_NODE_LOCAL_DNS", false, "Whether to scrape node-local-dns pods.")
	flags.BoolEnvVar(&p.ScrapeAnet, "prometheus-scrape-anet", "PROMETHEUS_SCRAPE_ANET", false, "Whether to scrape anet pods.")
	flags.BoolEnvVar(&p.ScrapeMastersWithPublicIPs, "prometheus-scrape-masters-with-public-ips", "PROMETHEUS_SCRAPE_MASTERS_WITH_PUBLIC_IPS", false, "Whether to scrape master machines using public IPs instead of private ones.")
	flags.IntEnvVar(&p.APIServerScrapePort, "prometheus-apiserver-scrape-port", "PROMETHEUS_APISERVER_SCRAPE_PORT", 443, "Port for scraping kube-apiserver (default 443).")
	flags.StringEnvVar(&p.SnapshotProject, "experimental-snapshot-project", "PROJECT", "", "GCP project where disks and snapshots are located.")
	flags.StringEnvVar(&p.ManifestPath, "prometheus-manifest-path", "PROMETHEUS_MANIFEST_PATH", "", "Path to the prometheus manifest files.")
	flags.StringEnvVar(&p.StorageClassProvisioner, "prometheus-storage-class-provisioner", "PROMETHEUS_STORAGE_CLASS_PROVISIONER", "kubernetes.io/gce-pd", "Volume plugin used to provision PVs for Prometheus.")
	flags.StringEnvVar(&p.StorageClassVolumeType, "prometheus-storage-class-volume-type", "PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE", "pd-ssd", "Volume type of the storage class; this will differ depending on the provisioner.")
	flags.StringEnvVar(&p.PVCStorageClass, "prometheus-pvc-storage-class", "PROMETHEUS_PVC_STORAGE_CLASS", "ssd", "Storage class used with the prometheus persistent volume claim.")
	flags.DurationEnvVar(&p.ReadyTimeout, "prometheus-ready-timeout", "PROMETHEUS_READY_TIMEOUT", 15*time.Minute, "Timeout for waiting for the Prometheus stack to become healthy.")
	flags.StringEnvVar(&p.PrometheusMemoryRequest, "prometheus-memory-request", "PROMETHEUS_MEMORY_REQUEST", "10Gi", "Memory request to be used by prometheus.")

	err := flags.MarkDeprecated("prometheus-manifest-path", "prometheus manifests are now taken from the embedded FS prepared at build time. This flag is planned to be removed in Jan 2023. Do you really need this flag?")
	if err != nil {
		klog.Fatalf("unable to mark flag prometheus-manifest-path deprecated: %v", err)
	}
}
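
// For reference, a hypothetical invocation (sketch only; the binary name is
// illustrative) enabling the server and etcd scraping. Every flag above can
// equivalently be set through the environment variable named in its definition:
//
//	clusterloader2 --enable-prometheus-server=true --prometheus-scrape-etcd=true
//	ENABLE_PROMETHEUS_SERVER=true PROMETHEUS_SCRAPE_ETCD=true clusterloader2 ...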

// ValidatePrometheusFlags validates prometheus flags.
func ValidatePrometheusFlags(p *config.PrometheusConfig) *clerrors.ErrorList {
	errList := clerrors.NewErrorList()
	if *shouldSnapshotPrometheusDisk && p.SnapshotProject == "" {
		errList.Append(fmt.Errorf("requesting snapshot, but snapshot project not configured; use the --experimental-snapshot-project flag"))
	}
	return errList
}

// Controller is a util for managing (setting up / tearing down) the prometheus stack in
// the cluster.
type Controller struct {
	clusterLoaderConfig *config.ClusterLoaderConfig
	// provider is the cloud provider derived from the --provider flag.
	provider provider.Provider
	// framework associated with the cluster where the prometheus stack should be set up.
	// For kubemark it's the root cluster, otherwise it's the main (and only) cluster.
	framework *framework.Framework
	// templateMapping is a mapping defining placeholders used in manifest templates.
	templateMapping map[string]interface{}
	// diskMetadata stores the name and zone of the Prometheus persistent disk.
	diskMetadata prometheusDiskMetadata
	// snapshotLock makes sure that only a single Prometheus snapshot is happening at a time.
	snapshotLock sync.Mutex
	// snapshotted records whether the Prometheus snapshot is already done - protected by snapshotLock.
	snapshotted bool
	// snapshotError contains the error from the snapshot attempt - protected by snapshotLock.
	snapshotError error
	// ssh executor to run commands on cluster nodes via ssh.
	ssh util.SSHExecutor
	// readyTimeout is the timeout for waiting for the Prometheus stack to become healthy.
	readyTimeout time.Duration
}

// NewController creates a new instance of Controller for the given config.
func NewController(clusterLoaderConfig *config.ClusterLoaderConfig) (pc *Controller, err error) {
	pc = &Controller{
		clusterLoaderConfig: clusterLoaderConfig,
		provider:            clusterLoaderConfig.ClusterConfig.Provider,
		readyTimeout:        clusterLoaderConfig.PrometheusConfig.ReadyTimeout,
	}

	if pc.framework, err = framework.NewRootFramework(&clusterLoaderConfig.ClusterConfig, numK8sClients); err != nil {
		return nil, err
	}

	mapping, errList := config.GetMapping(clusterLoaderConfig, nil)
	if errList != nil {
		return nil, errList
	}
	mapping["MasterIps"], err = getMasterIps(clusterLoaderConfig.ClusterConfig, clusterLoaderConfig.PrometheusConfig.ScrapeMastersWithPublicIPs)
	if err != nil {
		klog.Warningf("Couldn't get master IPs, will ignore manifests requiring them: %v", err)
		delete(mapping, "MasterIps")
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"]; !exists {
		mapping["PROMETHEUS_SCRAPE_APISERVER_ONLY"] = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldPrometheusScrapeApiserverOnly
	}
	// TODO: Change to pure assignments when overrides are not used.
	if _, exists := mapping["PROMETHEUS_SCRAPE_ETCD"]; !exists {
		mapping["PROMETHEUS_SCRAPE_ETCD"] = clusterLoaderConfig.PrometheusConfig.ScrapeEtcd
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeEtcd = mapping["PROMETHEUS_SCRAPE_ETCD"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"]; !exists {
		mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter = mapping["PROMETHEUS_SCRAPE_NODE_EXPORTER"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"]; !exists {
		mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"] = clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter = mapping["PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"]; !exists {
		clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = clusterLoaderConfig.ClusterConfig.Provider.Features().ShouldScrapeKubeProxy
		mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"].(bool)
	}
	if _, exists := mapping["PROMETHEUS_SCRAPE_ANET"]; !exists {
		mapping["PROMETHEUS_SCRAPE_ANET"] = clusterLoaderConfig.PrometheusConfig.ScrapeAnet
	} else {
		// Backward compatibility.
		clusterLoaderConfig.PrometheusConfig.ScrapeAnet = mapping["PROMETHEUS_SCRAPE_ANET"].(bool)
	}
	mapping["PROMETHEUS_SCRAPE_NODE_LOCAL_DNS"] = clusterLoaderConfig.PrometheusConfig.ScrapeNodeLocalDNS
	mapping["PROMETHEUS_SCRAPE_KUBE_STATE_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics
	mapping["PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS"] = clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics
	mapping["PROMETHEUS_SCRAPE_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets
	mapping["PROMETHEUS_SCRAPE_MASTER_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets || clusterLoaderConfig.PrometheusConfig.ScrapeMasterKubelets
	mapping["PROMETHEUS_APISERVER_SCRAPE_PORT"] = clusterLoaderConfig.PrometheusConfig.APIServerScrapePort
	mapping["PROMETHEUS_STORAGE_CLASS_PROVISIONER"] = clusterLoaderConfig.PrometheusConfig.StorageClassProvisioner
	mapping["PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE"] = clusterLoaderConfig.PrometheusConfig.StorageClassVolumeType
	mapping["PROMETHEUS_KUBE_PROXY_SELECTOR_KEY"] = clusterLoaderConfig.PrometheusConfig.KubeProxySelectorKey
	mapping["PROMETHEUS_PVC_STORAGE_CLASS"] = clusterLoaderConfig.PrometheusConfig.PVCStorageClass
	mapping["PROMETHEUS_MEMORY_REQUEST"] = clusterLoaderConfig.PrometheusConfig.PrometheusMemoryRequest
	snapshotEnabled, _ := pc.isEnabled()
	mapping["RetainPD"] = snapshotEnabled
	pc.templateMapping = mapping

	pc.ssh = &util.GCloudSSHExecutor{}

	return pc, nil
}
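
// A minimal usage sketch (not part of the original file) showing the intended
// lifecycle of Controller; assumes a populated *config.ClusterLoaderConfig named conf:
//
//	pc, err := NewController(conf)
//	if err != nil {
//		return err
//	}
//	if err := pc.SetUpPrometheusStack(); err != nil {
//		return err
//	}
//	defer func() {
//		if err := pc.TearDownPrometheusStack(); err != nil {
//			klog.Warningf("Tearing down prometheus stack failed: %v", err)
//		}
//	}()
//	// ... run the test, then optionally:
//	_ = pc.MakePrometheusSnapshotIfEnabled()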

// SetUpPrometheusStack sets up the prometheus stack in the cluster.
// This method is idempotent: if the prometheus stack is already set up, applying the manifests
// again is a no-op.
func (pc *Controller) SetUpPrometheusStack() error {
	k8sClient := pc.framework.GetClientSets().GetClient()

	klog.V(2).Info("Setting up prometheus stack")
	if err := client.CreateNamespace(k8sClient, namespace); err != nil {
		return err
	}
	// Removing the storage class, as its reclaim policy cannot be changed in place.
	if err := client.DeleteStorageClass(k8sClient, storageClass); err != nil {
		return err
	}
	if err := pc.applyManifests(coreManifests); err != nil {
		return err
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeNodeExporter {
		if err := pc.runNodeExporter(); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeWindowsNodeExporter {
		if err := pc.applyManifests(windowsNodeExporterManifests); err != nil {
			return err
		}
	} else {
		// Backward compatibility:
		// if scraping Windows nodes is enabled, set up the Windows node and template mapping.
		if isWindowsNodeScrapingEnabled(pc.templateMapping, pc.clusterLoaderConfig) {
			if err := setUpWindowsNodeAndTemplate(k8sClient, pc.templateMapping); err != nil {
				return err
			}
		}
	}
	if !pc.isKubemark() {
		if err := pc.applyManifests(defaultServiceMonitors); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeKubeStateMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportKubeStateMetrics {
		klog.V(2).Infof("Applying kube-state-metrics in the cluster.")
		if err := pc.applyManifests(kubeStateMetricsManifests); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeMetricsServerMetrics && pc.clusterLoaderConfig.ClusterConfig.Provider.Features().SupportMetricsServerMetrics {
		klog.V(2).Infof("Applying metrics-server in the cluster.")
		if err := pc.applyManifests(metricsServerManifests); err != nil {
			return err
		}
	}
	if _, ok := pc.templateMapping["MasterIps"]; ok {
		if err := pc.exposeAPIServerMetrics(); err != nil {
			return err
		}
		if err := pc.applyManifests(masterIPServiceMonitors); err != nil {
			return err
		}
	}
	if pc.clusterLoaderConfig.PrometheusConfig.EnablePushgateway {
		klog.V(2).Infof("Applying Pushgateway in the cluster.")
		if err := pc.applyManifests(pushgatewayManifests); err != nil {
			return err
		}
	}
	if err := pc.waitForPrometheusToBeHealthy(); err != nil {
		dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient)
		return err
	}
	klog.V(2).Info("Prometheus stack set up successfully")
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	return nil
}

// MakePrometheusSnapshotIfEnabled takes a snapshot of the Prometheus data if snapshotting is enabled.
func (pc *Controller) MakePrometheusSnapshotIfEnabled() error {
	klog.V(2).Info("Getting snapshot from Prometheus")
	if err := pc.snapshotPrometheusIfEnabled(); err != nil {
		klog.Warningf("Error while getting prometheus snapshot: %v", err)
		return err
	}
	return nil
}

// TearDownPrometheusStack tears down the prometheus stack, releasing all prometheus resources.
func (pc *Controller) TearDownPrometheusStack() error {
	// Get the disk metadata again to be sure.
	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
	}
	defer func() {
		klog.V(2).Info("Snapshotting prometheus disk")
		if err := pc.snapshotPrometheusDiskIfEnabledSynchronized(); err != nil {
			klog.Warningf("Error while snapshotting prometheus disk: %v", err)
		}
		if err := pc.deletePrometheusDiskIfEnabled(); err != nil {
			klog.Warningf("Error while deleting prometheus disk: %v", err)
		}
	}()

	klog.V(2).Info("Tearing down prometheus stack")
	k8sClient := pc.framework.GetClientSets().GetClient()
	if err := client.DeleteNamespace(k8sClient, namespace); err != nil {
		return err
	}
	if err := client.WaitForDeleteNamespace(k8sClient, namespace, client.DefaultNamespaceDeletionTimeout); err != nil {
		return err
	}
	return nil
}

// GetFramework returns the prometheus framework.
func (pc *Controller) GetFramework() *framework.Framework {
	return pc.framework
}

func (pc *Controller) applyManifests(manifestGlob string) error {
	return pc.framework.ApplyTemplatedManifests(
		pc.manifestsFS(), manifestGlob, pc.templateMapping, client.Retry(apierrs.IsNotFound))
}

func (pc *Controller) manifestsFS() fs.FS {
	if pc.clusterLoaderConfig.PrometheusConfig.ManifestPath != "" {
		return os.DirFS(pc.clusterLoaderConfig.PrometheusConfig.ManifestPath)
	}
	return manifestsFS
}
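
// Sketch (not part of the original file): when the deprecated
// --prometheus-manifest-path flag is set, manifestsFS() returns an os.DirFS over
// that directory, so the same globs resolve against it instead of the embedded
// copy. For example:
//
//	fsys := pc.manifestsFS()
//	matches, err := fs.Glob(fsys, coreManifests) // e.g. ["prometheus-rules.yaml", ...] (names illustrative)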

// exposeAPIServerMetrics configures anonymous access to the apiserver metrics.
func (pc *Controller) exposeAPIServerMetrics() error {
	klog.V(2).Info("Exposing kube-apiserver metrics in the cluster")
	// We need a client to the cluster that the test is being executed on,
	// not the cluster that prometheus is running in. Usually there is only
	// one cluster, but in the case of kubemark we have two, and thus we need to
	// create a new client here.
	clientSet, err := framework.NewMultiClientSet(
		pc.clusterLoaderConfig.ClusterConfig.KubeConfigPath, numK8sClients)
	if err != nil {
		return err
	}
	createClusterRole := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoles().Create(context.TODO(), &rbacv1.ClusterRole{
			ObjectMeta: metav1.ObjectMeta{Name: "apiserver-metrics-viewer"},
			Rules: []rbacv1.PolicyRule{
				{Verbs: []string{"get"}, NonResourceURLs: []string{"/metrics"}},
			},
		}, metav1.CreateOptions{})
		return err
	}
	createClusterRoleBinding := func() error {
		_, err := clientSet.GetClient().RbacV1().ClusterRoleBindings().Create(context.TODO(), &rbacv1.ClusterRoleBinding{
			ObjectMeta: metav1.ObjectMeta{Name: "system:anonymous"},
			RoleRef:    rbacv1.RoleRef{Kind: "ClusterRole", Name: "apiserver-metrics-viewer"},
			Subjects: []rbacv1.Subject{
				{Kind: "User", Name: "system:anonymous"},
			},
		}, metav1.CreateOptions{})
		return err
	}
	if err := retryCreateFunction(createClusterRole); err != nil {
		return err
	}
	if err := retryCreateFunction(createClusterRoleBinding); err != nil {
		return err
	}
	return nil
}
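
// A hypothetical verification sketch (not part of the original file): with the
// ClusterRole and ClusterRoleBinding above in place, an unauthenticated request
// is treated as the system:anonymous user, so fetching the apiserver's /metrics
// endpoint should succeed; masterIP is illustrative:
//
//	tr := &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}
//	resp, err := (&http.Client{Transport: tr}).Get("https://" + masterIP + "/metrics")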

// runNodeExporter adds node-exporter as a static manifest pod on master nodes.
// TODO(mborsz): Consider migrating to something less ugly, e.g. a daemonset-based approach,
// when master nodes have configured networking.
func (pc *Controller) runNodeExporter() error {
	klog.V(2).Infof("Starting node-exporter on master nodes.")
	kubemarkFramework, err := framework.NewFramework(&pc.clusterLoaderConfig.ClusterConfig, numK8sClients)
	if err != nil {
		return err
	}

	// Validate masters first.
	nodes, err := client.ListNodes(kubemarkFramework.GetClientSets().GetClient())
	if err != nil {
		return err
	}

	var g errgroup.Group
	numMasters := 0
	for _, node := range nodes {
		node := node
		if util.LegacyIsMasterNode(&node) || util.IsControlPlaneNode(&node) {
			numMasters++
			g.Go(func() error {
				f, err := pc.manifestsFS().Open(nodeExporterPod)
				if err != nil {
					return fmt.Errorf("unable to open manifest file: %v", err)
				}
				defer f.Close()
				return pc.ssh.Exec("sudo tee /etc/kubernetes/manifests/node-exporter.yaml > /dev/null", &node, f)
			})
		}
	}

	if numMasters == 0 {
		return fmt.Errorf("node-exporter requires master machines to be registered as nodes")
	}

	return g.Wait()
}

func (pc *Controller) waitForPrometheusToBeHealthy() error {
	klog.V(2).Info("Waiting for Prometheus stack to become healthy...")
	return wait.PollImmediate(
		checkPrometheusReadyInterval,
		pc.readyTimeout,
		pc.isPrometheusReady)
}

func (pc *Controller) isPrometheusReady() (bool, error) {
	// TODO(mm4tt): Re-enable kube-proxy monitoring and expect more targets.
	// This is a safeguard against a race condition where the prometheus server is started before
	// targets are registered. These 4 targets are always expected, in all possible configurations:
	// prometheus, prometheus-operator, grafana, apiserver.
	expectedTargets := 4
	if pc.clusterLoaderConfig.PrometheusConfig.ScrapeEtcd {
		// If scraping etcd is enabled (or it's kubemark, where we scrape etcd unconditionally), we need
		// slightly more complicated logic to assess whether all targets are ready. The etcd metrics port
		// changed in https://github.com/kubernetes/kubernetes/pull/77561; depending on the k8s version,
		// etcd metrics are available on port 2379 or 2382 (but never both). We solve that by setting up
		// two etcd ServiceMonitors, one for 2379 and one for 2382, and expect at least 1 of them to be healthy.
		ok, err := CheckAllTargetsReady( // All non-etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return !isEtcdEndpoint(t.Labels["endpoint"]) },
			expectedTargets)
		if err != nil || !ok {
			return ok, err
		}
		return CheckTargetsReady( // 1 out of 2 etcd targets should be ready.
			pc.framework.GetClientSets().GetClient(),
			func(t Target) bool { return isEtcdEndpoint(t.Labels["endpoint"]) },
			2, // expected targets: etcd-2379 and etcd-2382
			1) // one of them should be healthy
	}
	return CheckAllTargetsReady(
		pc.framework.GetClientSets().GetClient(),
		func(Target) bool { return true }, // All targets.
		expectedTargets)
}

func retryCreateFunction(f func() error) error {
	return client.RetryWithExponentialBackOff(
		client.RetryFunction(f, client.Allow(apierrs.IsAlreadyExists)))
}

func (pc *Controller) isKubemark() bool {
	return pc.provider.Features().IsKubemarkProvider
}

func dumpAdditionalLogsOnPrometheusSetupFailure(k8sClient kubernetes.Interface) {
	klog.V(2).Info("Dumping monitoring/prometheus-k8s events...")
	list, err := client.ListEvents(k8sClient, namespace, "prometheus-k8s")
	if err != nil {
		klog.Warningf("Error while listing monitoring/prometheus-k8s events: %v", err)
		return
	}
	s, err := json.MarshalIndent(list, "" /*=prefix*/, "  " /*=indent*/)
	if err != nil {
		klog.Warningf("Error while marshalling response %v: %v", list, err)
		return
	}
	klog.V(2).Info(string(s))
}

func getMasterIps(clusterConfig config.ClusterConfig, usePublicIPs bool) ([]string, error) {
	if usePublicIPs {
		if len(clusterConfig.MasterIPs) == 0 {
			return nil, fmt.Errorf("requested to use public IPs, however no public IPs were provided")
		}
		return clusterConfig.MasterIPs, nil
	}
	if len(clusterConfig.MasterInternalIPs) != 0 {
		klog.V(2).Infof("Using internal master IPs (%s) to monitor master components", clusterConfig.MasterInternalIPs)
		return clusterConfig.MasterInternalIPs, nil
	}
	klog.V(1).Infof("Unable to determine master IPs from flags or registered nodes. Falling back to the default/kubernetes service, which can be inaccurate in HA environments.")
	ips, err := getMasterIpsFromKubernetesService(clusterConfig)
	if err != nil {
		klog.Warningf("Failed to translate the default/kubernetes service to IPs: %v", err)
		return nil, fmt.Errorf("no IPs are set, fallback to the default/kubernetes service failed due to: %v", err)
	}
	klog.V(2).Infof("default/kubernetes service translated to: %v", ips)
	return ips, nil
}

func getMasterIpsFromKubernetesService(clusterConfig config.ClusterConfig) ([]string, error) {
	// This has to be done in the kubemark cluster, thus we need to create a new client.
	clientSet, err := framework.NewMultiClientSet(clusterConfig.KubeConfigPath, numK8sClients)
	if err != nil {
		return nil, err
	}

	var endpoints *corev1.Endpoints
	f := func() error {
		var err error
		endpoints, err = clientSet.GetClient().CoreV1().Endpoints("default").Get(context.TODO(), "kubernetes", metav1.GetOptions{})
		return err
	}
	if err := client.RetryWithExponentialBackOff(client.RetryFunction(f)); err != nil {
		return nil, err
	}

	var ips []string
	for _, subset := range endpoints.Subsets {
		for _, address := range subset.Addresses {
			ips = append(ips, address.IP)
		}
	}

	if len(ips) == 0 {
		return nil, errors.New("no master IPs available in the default/kubernetes service")
	}
	return ips, nil
}

func isEtcdEndpoint(endpoint string) bool {
	return endpoint == "etcd-2379" || endpoint == "etcd-2382"
}