Skip to content

Commit

Permalink
Add metrics to all major gce operations {latency, errors}
Browse files Browse the repository at this point in the history
The new metrics are:

  cloudprovider_gce_api_duration_seconds{request, region, zone}
  cloudprovider_gce_api_errors{request, region, zone}

`request` identifies the specific operation being measured (e.g. a `backendservice_`-, `cert_`- or `disk_`-prefixed name).
`region` is the target region (Will be "<n/a>" if not applicable)
`zone` is the target zone (Will be "<n/a>" if not applicable)

Note: this fixes some issues with the previous implementation of
metrics for disks:
- The duration tracked covered only the initial API call, not the entire
  operation.
- The metric label tuple would have produced many independent histograms,
  one for each disk, which did not aggregate well.
  • Loading branch information
bowei committed Apr 26, 2017
1 parent 274df99 commit 3f16302
Show file tree
Hide file tree
Showing 18 changed files with 526 additions and 272 deletions.
4 changes: 1 addition & 3 deletions pkg/cloudprovider/providers/gce/BUILD
Expand Up @@ -23,14 +23,14 @@ go_library(
"gce_instancegroup.go",
"gce_instances.go",
"gce_loadbalancer.go",
"gce_metrics.go",
"gce_op.go",
"gce_routes.go",
"gce_staticip.go",
"gce_targetproxy.go",
"gce_urlmap.go",
"gce_util.go",
"gce_zones.go",
"metrics.go",
"token_source.go",
],
tags = ["automanaged"],
Expand All @@ -43,13 +43,11 @@ go_library(
"//vendor/cloud.google.com/go/compute/metadata:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library",
"//vendor/golang.org/x/oauth2:go_default_library",
"//vendor/golang.org/x/oauth2/google:go_default_library",
"//vendor/google.golang.org/api/compute/v0.alpha:go_default_library",
"//vendor/google.golang.org/api/compute/v1:go_default_library",
"//vendor/google.golang.org/api/container/v1:go_default_library",
"//vendor/google.golang.org/api/gensupport:go_default_library",
"//vendor/google.golang.org/api/googleapi:go_default_library",
"//vendor/gopkg.in/gcfg.v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
Expand Down
39 changes: 1 addition & 38 deletions pkg/cloudprovider/providers/gce/gce.go
Expand Up @@ -25,7 +25,6 @@ import (
"time"

"cloud.google.com/go/compute/metadata"
"golang.org/x/net/context"

"gopkg.in/gcfg.v1"

Expand All @@ -39,7 +38,6 @@ import (
computealpha "google.golang.org/api/compute/v0.alpha"
compute "google.golang.org/api/compute/v1"
container "google.golang.org/api/container/v1"
"google.golang.org/api/gensupport"
)

const (
Expand Down Expand Up @@ -103,47 +101,12 @@ type Config struct {
}
}

// apiWithNamespace is the value stored in a request context to tag an
// outgoing API call for latency tracking.
type apiWithNamespace struct {
	// namespace is used as the label value of the recorded metric;
	// apiCall is the key used to look the metric up in gceMetricMap.
	namespace, apiCall string
}

// init runs at package load. In order, it registers the package metrics,
// registers the GCE cloud provider factory under ProviderName, and installs
// trackAPILatency as a gensupport hook.
func init() {
	registerMetrics()

	// The factory builds a GCE cloud from the supplied configuration reader.
	factory := func(config io.Reader) (cloudprovider.Interface, error) {
		return newGCECloud(config)
	}
	cloudprovider.RegisterCloudProvider(ProviderName, factory)

	gensupport.RegisterHook(trackAPILatency)
}

// trackAPILatency is the hook handed to gensupport.RegisterHook. It notes the
// time at which it is invoked and, when ctx carries an apiWithNamespace under
// the "kube-api-namespace" key, returns a callback that observes the elapsed
// seconds on the metric registered for that API call. It returns nil when the
// context carries no such value.
func trackAPILatency(ctx context.Context, req *http.Request) func(resp *http.Response) {
	// Take the timestamp first so the lookup below is part of the measurement.
	start := time.Now()

	info, tagged := ctx.Value("kube-api-namespace").(apiWithNamespace)
	if !tagged {
		return nil
	}

	return func(resp *http.Response) {
		elapsed := time.Since(start).Seconds()
		metric, found := gceMetricMap[info.apiCall]
		if !found {
			// No metric is registered for this API call; nothing to record.
			return
		}
		metric.WithLabelValues(info.namespace).Observe(elapsed)
	}
}

// contextWithNamespace returns a fresh background context that carries an
// apiWithNamespace built from namespace and apiCall under the
// "kube-api-namespace" key, which is the key trackAPILatency reads.
func contextWithNamespace(namespace string, apiCall string) context.Context {
	return context.WithValue(
		context.Background(),
		"kube-api-namespace",
		apiWithNamespace{namespace: namespace, apiCall: apiCall},
	)
}

// Raw access to the underlying GCE service, probably should only be used for e2e tests
Expand Down Expand Up @@ -340,7 +303,7 @@ func getNetworkNameViaAPICall(svc *compute.Service, projectID string) (string, e
}

if networkList == nil || len(networkList.Items) <= 0 {
return "", fmt.Errorf("GCE Network List call returned no networks for project %q.", projectID)
return "", fmt.Errorf("GCE Network List call returned no networks for project %q", projectID)
}

return networkList.Items[0].Name, nil
Expand Down
26 changes: 19 additions & 7 deletions pkg/cloudprovider/providers/gce/gce_backendservice.go
Expand Up @@ -18,11 +18,17 @@ package gce

import (
"net/http"
"time"

compute "google.golang.org/api/compute/v1"
)

// BackendService Management
// newBackendServiceMetricContext returns a metricContext whose clock starts
// now and whose attributes are "backendservice_<request>" followed by two
// unusedLabel entries.
func newBackendServiceMetricContext(request string) *metricContext {
	begin := time.Now()
	labels := []string{"backendservice_" + request, unusedLabel, unusedLabel}
	return &metricContext{start: begin, attributes: labels}
}

// GetBackendService retrieves a backend by name.
func (gce *GCECloud) GetBackendService(name string) (*compute.BackendService, error) {
Expand All @@ -31,32 +37,38 @@ func (gce *GCECloud) GetBackendService(name string) (*compute.BackendService, er

// UpdateBackendService applies the given BackendService as an update to an
// existing service. It issues BackendServices.Update for bg.Name in
// gce.projectID and, if that call succeeds, waits on the returned global
// operation.
//
// NOTE(review): the error and wait paths below each appear twice, with and
// without the metric context mc; presumably the mc variants are the current
// ones and the plain ones are the prior revision -- confirm.
func (gce *GCECloud) UpdateBackendService(bg *compute.BackendService) error {
// Metric context labelled "backendservice_update"; created before the API call.
mc := newBackendServiceMetricContext("update")
op, err := gce.service.BackendServices.Update(gce.projectID, bg.Name, bg).Do()
if err != nil {
return err
// Presumably records the failure on mc and hands err back -- TODO confirm.
return mc.Observe(err)
}
return gce.waitForGlobalOp(op)

return gce.waitForGlobalOp(op, mc)
}

// DeleteBackendService deletes the given BackendService by name. An HTTP 404
// from the delete call is treated as success and returns nil; on that path
// mc.Observe is not called. Any other error is returned, and on success the
// function waits on the returned global operation.
//
// NOTE(review): the error and wait paths below each appear twice, with and
// without the metric context mc; presumably the mc variants are the current
// ones and the plain ones are the prior revision -- confirm.
func (gce *GCECloud) DeleteBackendService(name string) error {
// Metric context labelled "backendservice_delete"; created before the API call.
mc := newBackendServiceMetricContext("delete")
op, err := gce.service.BackendServices.Delete(gce.projectID, name).Do()
if err != nil {
// Already gone: nothing to delete, so report success.
if isHTTPErrorCode(err, http.StatusNotFound) {
return nil
}
return err
return mc.Observe(err)
}
return gce.waitForGlobalOp(op)

return gce.waitForGlobalOp(op, mc)
}

// CreateBackendService creates the given BackendService. It issues
// BackendServices.Insert in gce.projectID and, if that call succeeds, waits
// on the returned global operation.
//
// NOTE(review): the error and wait paths below each appear twice, with and
// without the metric context mc; presumably the mc variants are the current
// ones and the plain ones are the prior revision -- confirm.
func (gce *GCECloud) CreateBackendService(bg *compute.BackendService) error {
// Metric context labelled "backendservice_create"; created before the API call.
mc := newBackendServiceMetricContext("create")
op, err := gce.service.BackendServices.Insert(gce.projectID, bg).Do()
if err != nil {
return err
return mc.Observe(err)
}
return gce.waitForGlobalOp(op)

return gce.waitForGlobalOp(op, mc)
}

// ListBackendServices lists all backend services in the project.
Expand Down
26 changes: 20 additions & 6 deletions pkg/cloudprovider/providers/gce/gce_cert.go
Expand Up @@ -18,11 +18,17 @@ package gce

import (
"net/http"
"time"

compute "google.golang.org/api/compute/v1"
)

// SSL Certificate management
// newCertMetricContext returns a metricContext whose clock starts now and
// whose attributes are "cert_<request>" followed by two unusedLabel entries.
func newCertMetricContext(request string) *metricContext {
	begin := time.Now()
	labels := []string{"cert_" + request, unusedLabel, unusedLabel}
	return &metricContext{start: begin, attributes: labels}
}

// GetSslCertificate returns the SslCertificate by name.
func (gce *GCECloud) GetSslCertificate(name string) (*compute.SslCertificate, error) {
Expand All @@ -31,26 +37,34 @@ func (gce *GCECloud) GetSslCertificate(name string) (*compute.SslCertificate, er

// CreateSslCertificate creates and returns a SslCertificate. It issues
// SslCertificates.Insert in gce.projectID, waits on the returned global
// operation, and then re-reads the certificate by sslCerts.Name via
// GetSslCertificate. On any failure it returns a nil certificate and the error.
//
// NOTE(review): the error and wait paths below each appear twice, with and
// without the metric context mc; presumably the mc variants are the current
// ones and the plain ones are the prior revision -- confirm.
func (gce *GCECloud) CreateSslCertificate(sslCerts *compute.SslCertificate) (*compute.SslCertificate, error) {
// Metric context labelled "cert_create"; created before the API call.
mc := newCertMetricContext("create")
op, err := gce.service.SslCertificates.Insert(gce.projectID, sslCerts).Do()

if err != nil {
return nil, err
return nil, mc.Observe(err)
}
if err = gce.waitForGlobalOp(op); err != nil {
return nil, err

// NOTE(review): mc is handed to waitForGlobalOp and then observed again on
// failure; if waitForGlobalOp already records on mc, the error would be
// counted twice -- confirm against waitForGlobalOp.
if err = gce.waitForGlobalOp(op, mc); err != nil {
return nil, mc.Observe(err)
}

return gce.GetSslCertificate(sslCerts.Name)
}

// DeleteSslCertificate deletes the SslCertificate by name. An HTTP 404 from
// the delete call is treated as success and returns nil; on that path
// mc.Observe is not called. Any other error is returned, and on success the
// function waits on the returned global operation.
//
// NOTE(review): the error and wait paths below each appear twice, with and
// without the metric context mc; presumably the mc variants are the current
// ones and the plain ones are the prior revision -- confirm.
func (gce *GCECloud) DeleteSslCertificate(name string) error {
// Metric context labelled "cert_delete"; created before the API call.
mc := newCertMetricContext("delete")
op, err := gce.service.SslCertificates.Delete(gce.projectID, name).Do()

if err != nil {
// Already gone: nothing to delete, so report success.
if isHTTPErrorCode(err, http.StatusNotFound) {
return nil
}
return err

return mc.Observe(err)
}
return gce.waitForGlobalOp(op)

return gce.waitForGlobalOp(op, mc)
}

// ListSslCertificates lists all SslCertificates in the project.
Expand Down
53 changes: 34 additions & 19 deletions pkg/cloudprovider/providers/gce/gce_disks.go
Expand Up @@ -22,6 +22,7 @@ import (
"net/http"
"path"
"strings"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -84,6 +85,13 @@ type GCEDisk struct {
Type string
}

// newDiskMetricContext returns a metricContext whose clock starts now and
// whose attributes are "disk_<request>", unusedLabel, and the given zone, in
// that order.
func newDiskMetricContext(request, zone string) *metricContext {
	begin := time.Now()
	labels := []string{"disk_" + request, unusedLabel, zone}
	return &metricContext{start: begin, attributes: labels}
}

func (gce *GCECloud) AttachDisk(diskName string, nodeName types.NodeName, readOnly bool) error {
instanceName := mapNodeNameToInstanceName(nodeName)
instance, err := gce.getInstanceByName(instanceName)
Expand All @@ -99,13 +107,16 @@ func (gce *GCECloud) AttachDisk(diskName string, nodeName types.NodeName, readOn
readWrite = "READ_ONLY"
}
attachedDisk := gce.convertDiskToAttachedDisk(disk, readWrite)
dc := contextWithNamespace(diskName, "gce_attach_disk")
attachOp, err := gce.service.Instances.AttachDisk(gce.projectID, disk.Zone, instance.Name, attachedDisk).Context(dc).Do()

mc := newDiskMetricContext("attach", instance.Zone)
attachOp, err := gce.service.Instances.AttachDisk(
gce.projectID, disk.Zone, instance.Name, attachedDisk).Do()

if err != nil {
return err
return mc.Observe(err)
}

return gce.waitForZoneOp(attachOp, disk.Zone)
return gce.waitForZoneOp(attachOp, disk.Zone, mc)
}

func (gce *GCECloud) DetachDisk(devicePath string, nodeName types.NodeName) error {
Expand All @@ -123,13 +134,14 @@ func (gce *GCECloud) DetachDisk(devicePath string, nodeName types.NodeName) erro

return fmt.Errorf("error getting instance %q", instanceName)
}
dc := contextWithNamespace(devicePath, "gce_detach_disk")
detachOp, err := gce.service.Instances.DetachDisk(gce.projectID, inst.Zone, inst.Name, devicePath).Context(dc).Do()

mc := newDiskMetricContext("detach", inst.Zone)
detachOp, err := gce.service.Instances.DetachDisk(gce.projectID, inst.Zone, inst.Name, devicePath).Do()
if err != nil {
return err
return mc.Observe(err)
}

return gce.waitForZoneOp(detachOp, inst.Zone)
return gce.waitForZoneOp(detachOp, inst.Zone, mc)
}

func (gce *GCECloud) DiskIsAttached(diskName string, nodeName types.NodeName) (bool, error) {
Expand Down Expand Up @@ -193,7 +205,9 @@ func (gce *GCECloud) DisksAreAttached(diskNames []string, nodeName types.NodeNam
// CreateDisk creates a new Persistent Disk, with the specified name &
// size, in the specified zone. It stores specified tags encoded in
// JSON in Description field.
func (gce *GCECloud) CreateDisk(name string, diskType string, zone string, sizeGb int64, tags map[string]string) error {
func (gce *GCECloud) CreateDisk(
name string, diskType string, zone string, sizeGb int64, tags map[string]string) error {

// Do not allow creation of PDs in zones that are not managed. Such PDs
// then cannot be deleted by DeleteDisk.
isManaged := false
Expand Down Expand Up @@ -228,13 +242,14 @@ func (gce *GCECloud) CreateDisk(name string, diskType string, zone string, sizeG
Description: tagsStr,
Type: diskTypeUri,
}
dc := contextWithNamespace(name, "gce_disk_insert")
createOp, err := gce.service.Disks.Insert(gce.projectID, zone, diskToCreate).Context(dc).Do()

mc := newDiskMetricContext("create", zone)
createOp, err := gce.service.Disks.Insert(gce.projectID, zone, diskToCreate).Do()
if err != nil {
return err
return mc.Observe(err)
}

err = gce.waitForZoneOp(createOp, zone)
err = gce.waitForZoneOp(createOp, zone, mc)
if isGCEError(err, "alreadyExists") {
glog.Warningf("GCE PD %q already exists, reusing", name)
return nil
Expand Down Expand Up @@ -304,8 +319,7 @@ func (gce *GCECloud) GetAutoLabelsForPD(name string, zone string) (map[string]st
// Returns a GCEDisk for the disk, if it is found in the specified zone.
// If not found, returns (nil, nil)
func (gce *GCECloud) findDiskByName(diskName string, zone string) (*GCEDisk, error) {
dc := contextWithNamespace(diskName, "gce_list_disk")
disk, err := gce.service.Disks.Get(gce.projectID, zone, diskName).Context(dc).Do()
disk, err := gce.service.Disks.Get(gce.projectID, zone, diskName).Do()
if err == nil {
d := &GCEDisk{
Zone: lastComponent(disk.Zone),
Expand Down Expand Up @@ -390,13 +404,14 @@ func (gce *GCECloud) doDeleteDisk(diskToDelete string) error {
return err
}

dc := contextWithNamespace(diskToDelete, "gce_disk_delete")
deleteOp, err := gce.service.Disks.Delete(gce.projectID, disk.Zone, disk.Name).Context(dc).Do()
mc := newDiskMetricContext("delete", disk.Zone)

deleteOp, err := gce.service.Disks.Delete(gce.projectID, disk.Zone, disk.Name).Do()
if err != nil {
return err
return mc.Observe(err)
}

return gce.waitForZoneOp(deleteOp, disk.Zone)
return gce.waitForZoneOp(deleteOp, disk.Zone, mc)
}

// Converts a Disk resource to an AttachedDisk resource.
Expand Down

0 comments on commit 3f16302

Please sign in to comment.