Skip to content

Commit

Permalink
make alert metrics count type & add service heartbeat (#1849)
Browse files Browse the repository at this point in the history
* switch alert metrics to count type; add service heartbeat signal

* rename vars
  • Loading branch information
kunmingg authored and k8s-ci-robot committed Oct 24, 2018
1 parent 430117c commit e4a5d84
Showing 1 changed file with 48 additions and 19 deletions.
67 changes: 48 additions & 19 deletions bootstrap/cmd/bootstrap/app/ksServer.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,28 @@ type ApplyRequest struct {
}

var (
// Counter metrics
deployReqCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deploy_requests",
Help: "Number of requests for deployments",
})
kfDeploymentsDoneCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "kubeflow_deployments_done",
Help: "Number of successfully finished Kubeflow deployments",
})
invalidRequest = prometheus.NewCounter(prometheus.CounterOpts{
Name: "invalid_requests",
Help: "Number of invalid deploy request",
})
deploymentFailure = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deployments_failure",
Help: "Number of failed Kubeflow deployments",
})
serviceHeartbeat = prometheus.NewCounter(prometheus.CounterOpts{
Name: "service_heartbeat",
Help: "Heartbeat signal every 10 seconds indicating pods are alive.",
})

// Gauge metrics
deployReqCounterRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "deploy_requests_raw",
Expand All @@ -200,14 +222,7 @@ var (
Name: "kubeflow_deployments_done_raw",
Help: "Number of successfully finished Kubeflow deployments",
})
InvalidRequestRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "invalid_request_raw",
Help: "Number of invalid deploy request",
})
DeploymentFailureRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "kubeflow_deployments_failure_raw",
Help: "Number of failed Kubeflow deployments",
})


// latencies
clusterDeploymentLatencies = prometheus.NewHistogram(prometheus.HistogramOpts{
Expand All @@ -224,13 +239,16 @@ var (

// init registers the package-level Prometheus collectors declared above
// (counters, gauges and latency histograms) via prometheus.MustRegister.
func init() {
// Register prometheus metrics: counters, gauges and histograms
prometheus.MustRegister(deployReqCounter)
prometheus.MustRegister(kfDeploymentsDoneCounter)
prometheus.MustRegister(clusterDeploymentLatencies)
prometheus.MustRegister(kfDeploymentLatencies)
prometheus.MustRegister(deployReqCounterRaw)
prometheus.MustRegister(clusterDeploymentsDoneRaw)
prometheus.MustRegister(kfDeploymentsDoneRaw)
prometheus.MustRegister(InvalidRequestRaw)
prometheus.MustRegister(DeploymentFailureRaw)
prometheus.MustRegister(invalidRequest)
prometheus.MustRegister(deploymentFailure)
prometheus.MustRegister(serviceHeartbeat)
}

func setupNamespace(namespaces type_v1.NamespaceInterface, name_space string) error {
Expand Down Expand Up @@ -882,7 +900,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
status, err = svc.GetDeploymentStatus(ctx, req)
if err != nil {
log.Errorf("Failed to get deployment status: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}
if status == "DONE" {
Expand All @@ -896,7 +914,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
}
if status != "DONE" {
log.Errorf("Deployment status is not done: %v", status)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -910,7 +928,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to update IAM: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -924,15 +942,15 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to insert service account key: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

log.Infof("Creating app...")
err = svc.CreateApp(ctx, req)
if err != nil {
log.Errorf("Failed to create app: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -954,32 +972,42 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to apply app: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}
}
kfDeploymentsDoneCounter.Inc()
kfDeploymentsDoneRaw.Inc()
kfDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
}

// countHeartbeat increments the serviceHeartbeat counter every 10 seconds for
// as long as the process runs. It never returns, so it is meant to be started
// in its own goroutine.
func countHeartbeat() {
	const interval = 10 * time.Second

	// A ticker fires on a fixed schedule, so the heartbeat cadence does not
	// drift by the time spent in each iteration the way sleeping in a loop does.
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for range ticker.C {
		serviceHeartbeat.Inc()
	}
}

func makeDeployEndpoint(svc KsService) endpoint.Endpoint {
return func(ctx context.Context, request interface{}) (interface{}, error) {
req := request.(CreateRequest)
r := &basicServerResponse{}
deployReqCounter.Inc()
deployReqCounterRaw.Inc()

dmServiceAccount := req.ProjectNumber + "@cloudservices.gserviceaccount.com"
err := svc.BindRole(ctx, req.Project, req.Token, dmServiceAccount)
if err != nil {
r.Err = err.Error()
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return r, err
}

err = svc.InsertDeployment(ctx, req)
if err != nil {
r.Err = err.Error()
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return r, err
}
go finishDeployment(svc, req)
Expand Down Expand Up @@ -1022,7 +1050,7 @@ func makeIamEndpoint(svc KsService) endpoint.Endpoint {
// decodeCreateAppRequest decodes the JSON body of r into a CreateRequest.
// The context argument is ignored. If decoding fails, the invalid-request
// metric is incremented and the decode error is returned with a nil request.
func decodeCreateAppRequest(_ context.Context, r *http.Request) (interface{}, error) {
var request CreateRequest
if err := json.NewDecoder(r.Body).Decode(&request); err != nil {
InvalidRequestRaw.Inc()
invalidRequest.Inc()
return nil, err
}
return request, nil
}
Expand Down Expand Up @@ -1137,5 +1165,6 @@ func (s *ksServer) StartHttp(port int) {
// add an http handler for prometheus metrics
http.Handle("/metrics", promhttp.Handler())

go countHeartbeat()
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", port), nil))
}

0 comments on commit e4a5d84

Please sign in to comment.