From 9f8a60afc2d4b37840c201762c2e583948ccf9d4 Mon Sep 17 00:00:00 2001
From: Lili Cosic
Date: Thu, 30 Apr 2020 15:50:33 +0200
Subject: [PATCH] WIP: Set degraded OperatorStatus based on critical alerts
 firing in monitoring stack

TODO: severity=critical
---
 pkg/operator/operator.go |  47 ++++++++++++----
 pkg/status/status.go     | 114 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 11 deletions(-)
 create mode 100644 pkg/status/status.go

diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 407e55d5de..253329c04e 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -15,7 +15,7 @@
 package operator
 
 import (
-	"strings"
+	"errors"
 	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
@@ -31,6 +31,7 @@ import (
 
 	"github.com/openshift/cluster-monitoring-operator/pkg/client"
 	"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
+	"github.com/openshift/cluster-monitoring-operator/pkg/status"
 	"github.com/openshift/cluster-monitoring-operator/pkg/tasks"
 )
 
@@ -55,6 +56,7 @@ type Operator struct {
 	telemetryMatches      []string
 	remoteWrite           bool
 
+	config *rest.Config
 	client *client.Client
 
 	cmapInf cache.SharedIndexInformer
@@ -83,6 +85,7 @@ func New(config *rest.Config, version, namespace, namespaceUserWorkload, namespa
 		namespace:             namespace,
 		namespaceUserWorkload: namespaceUserWorkload,
 		client:                c,
+		config:                config,
 		queue:                 workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cluster-monitoring"),
 	}
 
@@ -335,21 +338,43 @@ func (o *Operator) sync(key string) error {
 		klog.Errorf("error occurred while setting status to in progress: %v", err)
 	}
 
-	taskName, err := tl.RunAll()
+	_, err = tl.RunAll()
 	if err != nil {
-		klog.Infof("Updating ClusterOperator status to failed. Err: %v", err)
-		failedTaskReason := strings.Join(strings.Fields(taskName+"Failed"), "")
-		reportErr := o.client.StatusReporter().SetFailed(err, failedTaskReason)
-		if reportErr != nil {
-			klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+		// TODO: instrument the tasks better:
+		// - set a clear message describing what failed
+		// - set an SLO for tasks that fail
+		// - e.g. an SLO alert for cluster-monitoring-operator tasks
+		degraded, msg, degradedErr := status.IsDegraded(o.config)
+		if degradedErr != nil {
+			klog.Errorf("error occurred while checking whether monitoring is degraded: %v", degradedErr)
+		}
+		if degraded {
+			failedError := errors.New("monitoring stack is degraded: critical alerts for its components are firing, breaching the SLO")
+			reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
+			if reportErr != nil {
+				klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+			}
 		}
 		return err
 	}
-
-	klog.Info("Updating ClusterOperator status to done.")
-	err = o.client.StatusReporter().SetDone()
+	degraded, msg, err := status.IsDegraded(o.config)
 	if err != nil {
-		klog.Errorf("error occurred while setting status to done: %v", err)
+		klog.Errorf("error occurred while checking whether monitoring is degraded: %v", err)
+	}
+	if degraded {
+		failedError := errors.New("critical alerts for monitoring components are firing, breaching the SLO")
+		reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
+		if reportErr != nil {
+			klog.Errorf("error occurred while setting status to failed: %v", reportErr)
+		}
+	} else {
+		klog.Info("Updating ClusterOperator status to done.")
+		err = o.client.StatusReporter().SetDone()
+		if err != nil {
+			klog.Errorf("error occurred while setting status to done: %v", err)
+		}
 	}
 
 	return nil
diff --git a/pkg/status/status.go b/pkg/status/status.go
new file mode 100644
index 0000000000..5af5d106ad
--- /dev/null
+++ b/pkg/status/status.go
@@ -0,0 +1,114 @@
+// Copyright 2019 The Cluster Monitoring Operator Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package status
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/Jeffail/gabs"
+	routev1 "github.com/openshift/client-go/route/clientset/versioned/typed/route/v1"
+	"github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
+	"github.com/pkg/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// IsDegraded returns true when any critical alerts are firing in the openshift-monitoring
+// or openshift-user-workload-monitoring namespaces, or when querying the alerts fails,
+// as a failed query implies a problem somewhere in the cluster.
+func IsDegraded(config *rest.Config) (bool, string, error) {
+	firing, msg, err := alertsFiring(config)
+	if err != nil {
+		return true, msg, errors.Wrap(err, "could not query for firing alerts")
+	}
+	if firing {
+		return true, "MonitoringAlertsFiring", errors.New("critical alerts for the monitoring stack are firing")
+	}
+	return false, "", nil
+}
+
+func alertsFiring(config *rest.Config) (bool, string, error) {
+	// Build a Thanos Querier client from the openshift-monitoring route, authenticated
+	// with the cluster-monitoring-operator service account token.
+	openshiftRouteClient, err := routev1.NewForConfig(config)
+	if err != nil {
+		return false, "OpenShiftRouteClientError", errors.Wrap(err, "creating openshiftRouteClient failed")
+	}
+	kubeClient, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return false, "KubeConfigError", errors.Wrap(err, "creating kubeClient failed")
+	}
+	token, err := getServiceAccountToken(kubeClient, "openshift-monitoring", "cluster-monitoring-operator")
+	if err != nil {
+		return false, "ServiceAccountTokenMissing", err
+	}
+	thanosQuerierClient, err := framework.NewPrometheusClientFromRoute(
+		openshiftRouteClient,
+		"openshift-monitoring", "thanos-querier",
+		token,
+	)
+	if err != nil {
+		return false, "ThanosQuerierClientError", errors.Wrap(err, "creating ThanosQuerierClient failed")
+	}
+	// TODO: replace this with the actual alerts that we care about: critical alerts in
+	// openshift-monitoring and, when user workload monitoring is enabled, in
+	// openshift-user-workload-monitoring as well. For now any critical monitoring alert counts.
+	body, err := thanosQuerierClient.PrometheusQuery(`ALERTS{namespace="openshift-monitoring", severity="critical"}`)
+	if err != nil {
+		return false, "ThanosQuerierQueryFailed", err
+	}
+
+	res, err := gabs.ParseJSON(body)
+	if err != nil {
+		return false, "", err
+	}
+
+	count, err := res.ArrayCountP("data.result")
+	if err != nil {
+		return false, "", err
+	}
+
+	if count > 0 {
+		return true, "AlertsFiring", nil
+	}
+
+	return false, "", nil
+}
+
+func getServiceAccountToken(kubeClient *kubernetes.Clientset, namespace, name string) (string, error) {
+	secrets, err := kubeClient.CoreV1().Secrets(namespace).List(metav1.ListOptions{})
+	if err != nil {
+		return "", err
+	}
+	for _, secret := range secrets.Items {
+		_, dockerToken := secret.Annotations["openshift.io/create-dockercfg-secrets"]
+		token := strings.Contains(secret.Name, fmt.Sprintf("%s-token-", name))
+
+		// Skip the token secret that carries the openshift.io/create-dockercfg-secrets
+		// annotation, as that is the token used to talk to the internal registry.
+		if !dockerToken && token {
+			return string(secret.Data["token"]), nil
+		}
+	}
+	return "", errors.Errorf("cannot find token for %s/%s service account", namespace, name)
+}
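
Note: the following is a minimal, self-contained sketch of how alertsFiring interprets the Thanos Querier response. It assumes the standard Prometheus query-result envelope; the sample JSON, the alert name, and the main wrapper are illustrative only and not part of this patch. Any entry under data.result counts as a firing critical alert.

package main

import (
	"fmt"

	"github.com/Jeffail/gabs"
)

func main() {
	// Illustrative sample of a Prometheus /api/v1/query response; not real cluster output.
	sample := []byte(`{
		"status": "success",
		"data": {
			"resultType": "vector",
			"result": [
				{
					"metric": {
						"alertname": "SomeCriticalAlert",
						"namespace": "openshift-monitoring",
						"severity": "critical"
					},
					"value": [1588255833, "1"]
				}
			]
		}
	}`)

	res, err := gabs.ParseJSON(sample)
	if err != nil {
		panic(err)
	}

	// alertsFiring counts the entries under data.result; a count above zero means at
	// least one critical alert is firing and IsDegraded reports the operator as degraded.
	count, err := res.ArrayCountP("data.result")
	if err != nil {
		panic(err)
	}
	fmt.Printf("firing critical alerts: %d\n", count)
}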
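
A second, rougher sketch of how the new status.IsDegraded helper could be exercised outside the operator during development. The kubeconfig path is a placeholder, and the whole snippet is an assumption about how the helper would be called, not code from this patch; inside the operator the *rest.Config is the one passed to New.

package main

import (
	"fmt"

	"k8s.io/client-go/tools/clientcmd"

	"github.com/openshift/cluster-monitoring-operator/pkg/status"
)

func main() {
	// Placeholder kubeconfig path; adjust for the cluster being inspected.
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}

	degraded, reason, err := status.IsDegraded(config)
	if err != nil {
		// IsDegraded errs on the side of reporting degraded when the query itself fails.
		fmt.Printf("degraded check failed (reason %q): %v\n", reason, err)
	}
	fmt.Printf("degraded=%v reason=%q\n", degraded, reason)
}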