Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make alert metrics count type & add service heartbeat #1849

Merged
merged 2 commits into from
Oct 24, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 48 additions & 19 deletions bootstrap/cmd/bootstrap/app/ksServer.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,28 @@ type ApplyRequest struct {
}

var (
// Counter metrics
deployReqCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deploy_requests",
Help: "Number of requests for deployments",
})
kfDeploymentsDoneCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "kubeflow_deployments_done",
Help: "Number of successfully finished Kubeflow deployments",
})
invalidRequest = prometheus.NewCounter(prometheus.CounterOpts{
Name: "invalid_requests",
Help: "Number of invalid deploy request",
})
deploymentFailure = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deployments_failure",
Help: "Number of failed Kubeflow deployments",
})
serviceHeartbeat = prometheus.NewCounter(prometheus.CounterOpts{
Name: "service_heartbeat",
Help: "Heartbeat signal every 10 seconds indicating pods are alive.",
})

// Gauge metrics
deployReqCounterRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "deploy_requests_raw",
Expand All @@ -200,14 +222,7 @@ var (
Name: "kubeflow_deployments_done_raw",
Help: "Number of successfully finished Kubeflow deployments",
})
InvalidRequestRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "invalid_request_raw",
Help: "Number of invalid deploy request",
})
DeploymentFailureRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "kubeflow_deployments_failure_raw",
Help: "Number of failed Kubeflow deployments",
})


// latencies
clusterDeploymentLatencies = prometheus.NewHistogram(prometheus.HistogramOpts{
Expand All @@ -224,13 +239,16 @@ var (

func init() {
// Register prometheus counters
prometheus.MustRegister(deployReqCounter)
prometheus.MustRegister(kfDeploymentsDoneCounter)
prometheus.MustRegister(clusterDeploymentLatencies)
prometheus.MustRegister(kfDeploymentLatencies)
prometheus.MustRegister(deployReqCounterRaw)
prometheus.MustRegister(clusterDeploymentsDoneRaw)
prometheus.MustRegister(kfDeploymentsDoneRaw)
prometheus.MustRegister(InvalidRequestRaw)
prometheus.MustRegister(DeploymentFailureRaw)
prometheus.MustRegister(invalidRequest)
prometheus.MustRegister(deploymentFailure)
prometheus.MustRegister(serviceHeartbeat)
}

func setupNamespace(namespaces type_v1.NamespaceInterface, name_space string) error {
Expand Down Expand Up @@ -882,7 +900,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
status, err = svc.GetDeploymentStatus(ctx, req)
if err != nil {
log.Errorf("Failed to get deployment status: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}
if status == "DONE" {
Expand All @@ -896,7 +914,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
}
if status != "DONE" {
log.Errorf("Deployment status is not done: %v", status)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -910,7 +928,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to update IAM: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -924,15 +942,15 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to insert service account key: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

log.Infof("Creating app...")
err = svc.CreateApp(ctx, req)
if err != nil {
log.Errorf("Failed to create app: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}

Expand All @@ -954,32 +972,42 @@ func finishDeployment(svc KsService, req CreateRequest) {
})
if err != nil {
log.Errorf("Failed to apply app: %v", err)
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return
}
}
kfDeploymentsDoneCounter.Inc()
kfDeploymentsDoneRaw.Inc()
kfDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
}

// Add heartbeat every 10 seconds
func countHeartbeat() {
for {
time.Sleep(10 * time.Second)
serviceHeartbeat.Inc()
}
}

func makeDeployEndpoint(svc KsService) endpoint.Endpoint {
return func(ctx context.Context, request interface{}) (interface{}, error) {
req := request.(CreateRequest)
r := &basicServerResponse{}
deployReqCounter.Inc()
deployReqCounterRaw.Inc()

dmServiceAccount := req.ProjectNumber + "@cloudservices.gserviceaccount.com"
err := svc.BindRole(ctx, req.Project, req.Token, dmServiceAccount)
if err != nil {
r.Err = err.Error()
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return r, err
}

err = svc.InsertDeployment(ctx, req)
if err != nil {
r.Err = err.Error()
DeploymentFailureRaw.Inc()
deploymentFailure.Inc()
return r, err
}
go finishDeployment(svc, req)
Expand Down Expand Up @@ -1022,7 +1050,7 @@ func makeIamEndpoint(svc KsService) endpoint.Endpoint {
func decodeCreateAppRequest(_ context.Context, r *http.Request) (interface{}, error) {
var request CreateRequest
if err := json.NewDecoder(r.Body).Decode(&request); err != nil {
InvalidRequestRaw.Inc()
invalidRequest.Inc()
return nil, err
}
return request, nil
Expand Down Expand Up @@ -1137,5 +1165,6 @@ func (s *ksServer) StartHttp(port int) {
// add an http handler for prometheus metrics
http.Handle("/metrics", promhttp.Handler())

go countHeartbeat()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: you could probably inline this one as

go func() {
   time.Sleep(10 * time.Second)
   serviceHeartbeat.Inc()
}()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, added the labels too soon...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It works the same either way :)

log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", port), nil))
}