Skip to content

Commit

Permalink
add status tag to deployments_failure; exclude failure caused by quota or permission issue (kubeflow#2215)
Browse files Browse the repository at this point in the history

* add status tag to deployments_failure; exclude failure caused by quota or permission issue

* address comments
  • Loading branch information
kunmingg authored and Kam D Kasravi committed Feb 8, 2019
1 parent cd99bc1 commit 670d488
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 16 deletions.
19 changes: 12 additions & 7 deletions bootstrap/cmd/bootstrap/app/gcpUtils.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ func (s *ksServer) InsertDeployment(ctx context.Context, req CreateRequest) (*de
confByte, err := yaml.Marshal(dmconf)
if err != nil {
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return nil, err
}
templateData, err := ioutil.ReadFile(path.Join(regPath, "../deployment/gke/deployment_manager_configs/cluster.jinja"))
if err != nil {
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return nil, err
}
ts := oauth2.StaticTokenSource(&oauth2.Token{
Expand All @@ -91,7 +91,7 @@ func (s *ksServer) InsertDeployment(ctx context.Context, req CreateRequest) (*de
deploymentmanagerService, err := deploymentmanager.New(oauth2.NewClient(ctx, ts))
if err != nil {
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return nil, err
}
rb := &deploymentmanager.Deployment{
Expand All @@ -118,19 +118,24 @@ func (s *ksServer) InsertDeployment(ctx context.Context, req CreateRequest) (*de
return rb, nil
}

func (s *ksServer) GetDeploymentStatus(ctx context.Context, req CreateRequest) (string, error) {
func (s *ksServer) GetDeploymentStatus(ctx context.Context, req CreateRequest) (string, string, error) {
ts := oauth2.StaticTokenSource(&oauth2.Token{
AccessToken: req.Token,
})
deploymentmanagerService, err := deploymentmanager.New(oauth2.NewClient(ctx, ts))
if err != nil {
return "", err
return "", "", err
}
dm, err := deploymentmanagerService.Deployments.Get(req.Project, req.Name).Context(ctx).Do()
if err != nil {
return "", err
return "", "", err
}
return dm.Operation.Status, nil
if dm.Operation.Status == "DONE" {
if dm.Operation.Error != nil && len(dm.Operation.Error.Errors) > 0 {
return dm.Operation.Status, dm.Operation.Error.Errors[0].Message, nil
}
}
return dm.Operation.Status, "", nil
}

// Clear existing bindings for auto-generated service accounts of current deployment.
Expand Down
39 changes: 30 additions & 9 deletions bootstrap/cmd/bootstrap/app/ksServer.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ type KsService interface {
ConfigCluster(context.Context, CreateRequest) error
BindRole(context.Context, string, string, string) error
InsertDeployment(context.Context, CreateRequest) (*deploymentmanager.Deployment, error)
GetDeploymentStatus(context.Context, CreateRequest) (string, error)
GetDeploymentStatus(context.Context, CreateRequest) (string, string, error)
ApplyIamPolicy(context.Context, ApplyIamRequest) error
GetProjectLock(string) *sync.Mutex
}
Expand Down Expand Up @@ -242,15 +242,25 @@ var (
},
[]string{"status"},
)
deploymentFailure = prometheus.NewCounter(prometheus.CounterOpts{
deploymentFailure = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "deployments_failure",
Help: "Number of failed Kubeflow deployments",
})
}, []string{"status"})

serviceHeartbeat = prometheus.NewCounter(prometheus.CounterOpts{
Name: "service_heartbeat",
Help: "Heartbeat signal every 10 seconds indicating pods are alive.",
})

deployReqCounterUser = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deploy_requests_user",
Help: "Number of user requests for deployments",
})
kfDeploymentsDoneUser = prometheus.NewCounter(prometheus.CounterOpts{
Name: "kubeflow_deployments_done_user",
Help: "Number of successfully finished Kubeflow user deployments",
})

// Gauge metrics
deployReqCounterRaw = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "deploy_requests_raw",
Expand Down Expand Up @@ -279,6 +289,8 @@ func init() {
prometheus.MustRegister(deployReqCounter)
prometheus.MustRegister(clusterDeploymentLatencies)
prometheus.MustRegister(kfDeploymentLatencies)
prometheus.MustRegister(deployReqCounterUser)
prometheus.MustRegister(kfDeploymentsDoneUser)
prometheus.MustRegister(deployReqCounterRaw)
prometheus.MustRegister(kfDeploymentsDoneRaw)
prometheus.MustRegister(deploymentFailure)
Expand Down Expand Up @@ -1082,14 +1094,21 @@ func finishDeployment(svc KsService, req CreateRequest, dmDeploy *deploymentmana
ctx = context.WithValue(ctx, StartTime, time.Now())
for retry := 0; retry < 60; retry++ {
time.Sleep(10 * time.Second)
status, err = svc.GetDeploymentStatus(ctx, req)
status, errMsg, err := svc.GetDeploymentStatus(ctx, req)
if err != nil {
log.Errorf("Failed to get deployment status: %v", err)
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return
}
if status == "DONE" {
if errMsg != "" {
log.Errorf("Deployment manager returned error message: %v", errMsg)
// Mark status "INVALID_ARGUMENT" as most deployment manager failures are caused by insufficient quota or permission.
// Error messages are available from UI, and should be resolvable by retries.
deployReqCounter.WithLabelValues("INVALID_ARGUMENT").Inc()
return
}
clusterDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
log.Infof("Deployment is done")
break
Expand All @@ -1099,7 +1118,7 @@ func finishDeployment(svc KsService, req CreateRequest, dmDeploy *deploymentmana
if status != "DONE" {
log.Errorf("Deployment status is not done: %v", status)
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return
}

Expand All @@ -1114,14 +1133,14 @@ func finishDeployment(svc KsService, req CreateRequest, dmDeploy *deploymentmana
if err != nil {
log.Errorf("Failed to update IAM: %v", err)
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return
}

log.Infof("Configuring cluster...")
if err = svc.ConfigCluster(ctx, req); err != nil {
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return
}

Expand All @@ -1130,13 +1149,14 @@ func finishDeployment(svc KsService, req CreateRequest, dmDeploy *deploymentmana
if err != nil {
log.Errorf("Failed to create app: %v", err)
deployReqCounter.WithLabelValues("INTERNAL").Inc()
deploymentFailure.Inc()
deploymentFailure.WithLabelValues("INTERNAL").Inc()
return
}

deployReqCounter.WithLabelValues("OK").Inc()
if req.Project != "kubeflow-prober-deploy" {
kfDeploymentsDoneRaw.Inc()
kfDeploymentsDoneUser.Inc()
}
kfDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
}
Expand All @@ -1155,6 +1175,7 @@ func makeDeployEndpoint(svc KsService) endpoint.Endpoint {
r := &basicServerResponse{}
if req.Project != "kubeflow-prober-deploy" {
deployReqCounterRaw.Inc()
deployReqCounterUser.Inc()
}
if err := req.Validate(); err != nil {
r.Err = err.Error()
Expand Down

0 comments on commit 670d488

Please sign in to comment.