From 9f8a60afc2d4b37840c201762c2e583948ccf9d4 Mon Sep 17 00:00:00 2001
From: Lili Cosic
Date: Thu, 30 Apr 2020 15:50:33 +0200
Subject: [PATCH] WIP: Set degraded OperatorStatus based on critical alerts
 firing in monitoring stack

TODO: severity=critical
---
 pkg/operator/operator.go |  47 ++++++++++++----
 pkg/status/status.go     | 114 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 11 deletions(-)
 create mode 100644 pkg/status/status.go

diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 407e55d5de..253329c04e 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -15,7 +15,7 @@
 package operator
 
 import (
-	"strings"
+	"errors"
 	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
@@ -31,6 +31,7 @@ import (
 
 	"github.com/openshift/cluster-monitoring-operator/pkg/client"
 	"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
+	"github.com/openshift/cluster-monitoring-operator/pkg/status"
 	"github.com/openshift/cluster-monitoring-operator/pkg/tasks"
 )
 
@@ -55,6 +56,7 @@ type Operator struct {
 	telemetryMatches      []string
 	remoteWrite           bool
 
+	config *rest.Config
 	client *client.Client
 
 	cmapInf cache.SharedIndexInformer
@@ -83,6 +85,7 @@ func New(config *rest.Config, version, namespace, namespaceUserWorkload, namespa
 		namespace:             namespace,
 		namespaceUserWorkload: namespaceUserWorkload,
 		client:                c,
+		config:                config,
 		queue:                 workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cluster-monitoring"),
 	}
 
@@ -335,21 +338,43 @@ func (o *Operator) sync(key string) error {
 		klog.Errorf("error occurred while setting status to in progress: %v", err)
 	}
 
-	taskName, err := tl.RunAll()
+	_, err = tl.RunAll()
 	if err != nil {
-		klog.Infof("Updating ClusterOperator status to failed. Err: %v", err)
-		failedTaskReason := strings.Join(strings.Fields(taskName+"Failed"), "")
-		reportErr := o.client.StatusReporter().SetFailed(err, failedTaskReason)
-		if reportErr != nil {
-			klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+		// TODO: instrument the tasks better:
+		// - set a clear message describing what failed
+		// - set an SLO for tasks that fail
+		// - e.g. an SLO alert for cluster-monitoring-operator tasks
+		degraded, msg, degradedErr := status.IsDegraded(o.config)
+		if degradedErr != nil {
+			klog.Errorf("error occurred while checking whether monitoring is degraded: %v", degradedErr)
+		}
+		if degraded {
+			failedError := errors.New("monitoring stack is degraded: critical alerts for its components are firing, breaching the SLO")
+			reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
+			if reportErr != nil {
+				klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+			}
 		}
 		return err
 	}
-
-	klog.Info("Updating ClusterOperator status to done.")
-	err = o.client.StatusReporter().SetDone()
+	degraded, msg, err := status.IsDegraded(o.config)
 	if err != nil {
-		klog.Errorf("error occurred while setting status to done: %v", err)
+		klog.Errorf("error occurred while checking whether monitoring is degraded: %v", err)
+	}
+	if degraded {
+		failedError := errors.New("critical alerts for monitoring components are firing, breaching the SLO")
+		reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
+		if reportErr != nil {
+			klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+		}
+	} else {
+		klog.Info("Updating ClusterOperator status to done.")
+		err = o.client.StatusReporter().SetDone()
+		if err != nil {
+			klog.Errorf("error occurred while setting status to done: %v", err)
+		}
 	}
 
 	return nil
diff --git a/pkg/status/status.go b/pkg/status/status.go
new file mode 100644
index 0000000000..5af5d106ad
--- /dev/null
+++ b/pkg/status/status.go
@@ -0,0 +1,114 @@
+// Copyright 2019 The Cluster Monitoring Operator Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package status
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/Jeffail/gabs"
+	routev1 "github.com/openshift/client-go/route/clientset/versioned/typed/route/v1"
+	"github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
+	"github.com/pkg/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// IsDegraded returns true when any critical alerts are firing in the openshift-monitoring
+// or openshift-user-workload-monitoring namespaces, or when querying the alerts fails,
+// as a failed query implies a problem somewhere in the cluster.
+func IsDegraded(config *rest.Config) (bool, string, error) {
+	firing, msg, err := alertsFiring(config)
+	if err != nil {
+		return true, msg, errors.Wrap(err, "could not query for firing alerts")
+	}
+	if firing {
+		return true, "MonitoringAlertsFiring", errors.New("critical alerts for the monitoring stack are firing")
+	}
+	return false, "", nil
+}
+
+func alertsFiring(config *rest.Config) (bool, string, error) {
+	// Build a Thanos Querier client from the openshift-monitoring route, authenticated
+	// with the cluster-monitoring-operator service account token.
+	openshiftRouteClient, err := routev1.NewForConfig(config)
+	if err != nil {
+		return false, "OpenShiftRouteClientError", errors.Wrap(err, "creating openshiftRouteClient failed")
+	}
+	kubeClient, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return false, "KubeConfigError", errors.Wrap(err, "creating kubeClient failed")
+	}
+	token, err := getServiceAccountToken(kubeClient, "openshift-monitoring", "cluster-monitoring-operator")
+	if err != nil {
+		return false, "ServiceAccountTokenMissing", err
+	}
+	thanosQuerierClient, err := framework.NewPrometheusClientFromRoute(
+		openshiftRouteClient,
+		"openshift-monitoring", "thanos-querier",
+		token,
+	)
+	if err != nil {
+		return false, "ThanosQuerierClientError", errors.Wrap(err, "creating ThanosQuerierClient failed")
+	}
+	// TODO: replace this with the actual alerts that we care about: critical alerts in
+	// openshift-monitoring and, when user workload monitoring is enabled, in
+	// openshift-user-workload-monitoring as well. For now any critical monitoring alert counts.
+	body, err := thanosQuerierClient.PrometheusQuery(`ALERTS{namespace="openshift-monitoring", severity="critical"}`)
+	if err != nil {
+		return false, "ThanosQuerierQueryFailed", err
+	}
+
+	res, err := gabs.ParseJSON(body)
+	if err != nil {
+		return false, "", err
+	}
+
+	count, err := res.ArrayCountP("data.result")
+	if err != nil {
+		return false, "", err
+	}
+
+	if count > 0 {
+		return true, "AlertsFiring", nil
+	}
+
+	return false, "", nil
+}
+
+func getServiceAccountToken(kubeClient *kubernetes.Clientset, namespace, name string) (string, error) {
+	secrets, err := kubeClient.CoreV1().Secrets(namespace).List(metav1.ListOptions{})
+	if err != nil {
+		return "", err
+	}
+	for _, secret := range secrets.Items {
+		_, dockerToken := secret.Annotations["openshift.io/create-dockercfg-secrets"]
+		token := strings.Contains(secret.Name, fmt.Sprintf("%s-token-", name))
+
+		// Skip the token secret that carries the openshift.io/create-dockercfg-secrets
+		// annotation, as that is the token used to talk to the internal registry.
+		if !dockerToken && token {
+			return string(secret.Data["token"]), nil
+		}
+	}
+	return "", errors.Errorf("cannot find token for %s/%s service account", namespace, name)
+}
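
Note: the following is a minimal, self-contained sketch of how alertsFiring interprets the Thanos Querier response. It assumes the standard Prometheus query-result envelope; the sample JSON, the alert name, and the main wrapper are illustrative only and not part of this patch. Any entry under data.result counts as a firing critical alert.

package main

import (
	"fmt"

	"github.com/Jeffail/gabs"
)

func main() {
	// Illustrative sample of a Prometheus /api/v1/query response; not real cluster output.
	sample := []byte(`{
		"status": "success",
		"data": {
			"resultType": "vector",
			"result": [
				{
					"metric": {
						"alertname": "SomeCriticalAlert",
						"namespace": "openshift-monitoring",
						"severity": "critical"
					},
					"value": [1588255833, "1"]
				}
			]
		}
	}`)

	res, err := gabs.ParseJSON(sample)
	if err != nil {
		panic(err)
	}

	// alertsFiring counts the entries under data.result; a count above zero means at
	// least one critical alert is firing and IsDegraded reports the operator as degraded.
	count, err := res.ArrayCountP("data.result")
	if err != nil {
		panic(err)
	}
	fmt.Printf("firing critical alerts: %d\n", count)
}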
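
A second, rougher sketch of how the new status.IsDegraded helper could be exercised outside the operator during development. The kubeconfig path is a placeholder, and the whole snippet is an assumption about how the helper would be called, not code from this patch; inside the operator the *rest.Config is the one passed to New.

package main

import (
	"fmt"

	"k8s.io/client-go/tools/clientcmd"

	"github.com/openshift/cluster-monitoring-operator/pkg/status"
)

func main() {
	// Placeholder kubeconfig path; adjust for the cluster being inspected.
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}

	degraded, reason, err := status.IsDegraded(config)
	if err != nil {
		// IsDegraded errs on the side of reporting degraded when the query itself fails.
		fmt.Printf("degraded check failed (reason %q): %v\n", reason, err)
	}
	fmt.Printf("degraded=%v reason=%q\n", degraded, reason)
}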