In [1]:
from google.cloud import monitoring_v3
from google.cloud.monitoring_v3.query import Query
import datetime
import time

import requests

In [7]:
_GCP_METADATA_URI = 'http://metadata.google.internal/computeMetadata/v1/'
_GCP_METADATA_URI_HEADER = {'Metadata-Flavor': 'Google'}

In [22]:
response = requests.get(_GCP_METADATA_URI + 'instance/id', headers=_GCP_METADATA_URI_HEADER)

In [23]:
response.text

'284365999706661199'

In [5]:
type(monitoring_v3.enums.MetricDescriptor.ValueType.INT64)

<enum 'ValueType'>

In [6]:
point = monitoring_v3.types.Point()

In [7]:
type(point)

google.cloud.monitoring_v3.types.Point

In [11]:
dir(point.value)

['ByteSize',
 'Clear',
 'ClearExtension',
 'ClearField',
 'CopyFrom',
 'DESCRIPTOR',
 'DiscardUnknownFields',
 'Extensions',
 'FindInitializationErrors',
 'FromString',
 'HasExtension',
 'HasField',
 'IsInitialized',
 'ListFields',
 'MergeFrom',
 'MergeFromString',
 'ParseFromString',
 'RegisterExtension',
 'SerializePartialToString',
 'SerializeToString',
 'SetInParent',
 'UnknownFields',
 'WhichOneof',
 '_CheckCalledFromGeneratedFile',
 '_SetListener',
 '__class__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_extensions_by_name',
 '_extensions_by_number',
 'bool_value',
 'distribution_value',
 'double_value',
 'int64_value'

In [2]:
client = monitoring_v3.MetricServiceClient()
project_name = f'projects/jk-mlops-dev'
filter = 'metric.type = starts_with("custom.googleapis.com/gce/gpu-test")'


In [3]:
for descriptor in client.list_metric_descriptors(name=project_name, filter_=filter):
    print(f'Descriptor: {descriptor.name}')
    print(descriptor)

Descriptor: projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/fp32_active
name: "projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/fp32_active"
labels {
  key: "gpu"
}
metric_kind: GAUGE
value_type: DOUBLE
unit: "ratio"
description: "Ratio of cycles the FP32 cores are active"
type: "custom.googleapis.com/gce/gpu-test/fp32_active"

Descriptor: projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/gr_engine_active
name: "projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/gr_engine_active"
labels {
  key: "gpu"
}
metric_kind: GAUGE
value_type: DOUBLE
unit: "ratio"
description: "Ratio of time the graphics engine is active"
type: "custom.googleapis.com/gce/gpu-test/gr_engine_active"

Descriptor: projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/mem_used
name: "projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu-test/mem_used"
labels {
  key: "gpu"
}
metri

In [4]:
client.get_metric_descriptor(name='projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu/sm_active')

name: "projects/jk-mlops-dev/metricDescriptors/custom.googleapis.com/gce/gpu/sm_active"
labels {
  key: "gpu"
}
metric_kind: GAUGE
value_type: DOUBLE
unit: "ratio"
description: "Ratio of cycles an SM has at least 1 warp assigned"
type: "custom.googleapis.com/gce/gpu/sm_active"

In [28]:
def report_gpu_metric(value, seconds, nanos, type, instance_id, zone, project_id):
    series = monitoring_v3.types.TimeSeries()
    series.metric.type = 'custom.googleapis.com/{type}'.format(type=type)
    series.resource.type = 'gce_instance'
    series.resource.labels['instance_id'] = instance_id
    series.resource.labels['zone'] = zone
    series.resource.labels['project_id'] = project_id

    point = series.points.add()
    point.value.int64_value = value
    point.interval.end_time.seconds = seconds
    point.interval.end_time.nanos = nanos
    
    client.create_time_series(project_name, [series])
    

In [12]:
metric_type = 'gce/gpu/utilization'
instance_id = '284365999706661199'
zone = 'us-west1-b'
project_id = 'jk-mlops-dev'

now = time.time()
seconds = int(now) 
nanos = int(
        (now - seconds) * 10**9)

In [19]:
value = 94

result = report_gpu_metric(value=value,
                      seconds=seconds,
                      nanos=nanos,
                      type=metric_type,
                      instance_id=instance_id,
                      zone=zone,
                      project_id=project_id)

interval {
  end_time {
    seconds: 1616006401
    nanos: 425619602
  }
}
value {
  int64_value: 94
}

None


In [25]:
filter = 'metric.type = "custom.googleapis.com/gce/gpu/utilization"'

now = time.time()
seconds = int(now)
nanos = int((now - seconds)*10**9)
interval = monitoring_v3.types.TimeInterval(
        end_time = {"seconds": seconds, "nanos": nanos},
        start_time = {"seconds": (seconds - 480), "nanos": nanos}
)


responses = client.list_time_series(
        name=project_name,
        filter_=filter,
        interval=interval,
        view=monitoring_v3.types.ListTimeSeriesRequest.TimeSeriesView.FULL
)
results = list(responses)

for point in results[0].points:
    print(point.interval.end_time.seconds, point.value)

1616008881 int64_value: 73

1616008875 int64_value: 70



1616005598 int64_value: 91

1616005519 int64_value: 91

1616005509 int64_value: 65

1616005473 int64_value: 65

1616005408 int64_value: 65

1616005359 int64_value: 65

1616005315 int64_value: 65



In [19]:
for point in results[0].points:
   print(point)

TypeError: 'GRPCIterator' object is not subscriptable

In [9]:
dir(monitoring_v3.types)

['Aggregation',
 'AlertPolicy',
 'Any',
 'BasicSli',
 'BoolValue',
 'BytesValue',
 'CreateAlertPolicyRequest',
 'CreateGroupRequest',
 'CreateMetricDescriptorRequest',
 'CreateNotificationChannelRequest',
 'CreateServiceLevelObjectiveRequest',
 'CreateServiceRequest',
 'CreateTimeSeriesError',
 'CreateTimeSeriesRequest',
 'CreateTimeSeriesSummary',
 'CreateUptimeCheckConfigRequest',
 'DeleteAlertPolicyRequest',
 'DeleteGroupRequest',
 'DeleteMetricDescriptorRequest',
 'DeleteNotificationChannelRequest',
 'DeleteServiceLevelObjectiveRequest',
 'DeleteServiceRequest',
 'DeleteUptimeCheckConfigRequest',
 'Distribution',
 'DistributionCut',
 'DoubleValue',
 'DroppedLabels',
 'Duration',
 'Empty',
 'FieldMask',
 'FloatValue',
 'GetAlertPolicyRequest',
 'GetGroupRequest',
 'GetMetricDescriptorRequest',
 'GetMonitoredResourceDescriptorRequest',
 'GetNotificationChannelDescriptorRequest',
 'GetNotificationChannelRequest',
 'GetNotificationChannelVerificationCodeRequest',
 'GetNotificationChann

In [85]:
for descriptor in client.list_metric_descriptors(name=project_name, filter_=filter):
    print(f'Deleting: {descriptor.name}')
    client.delete_metric_descriptor(name=descriptor.name)

In [87]:
project_id = 'jk-mlops-dev'
metric_type = 'custom.googleapis.com/opencensus/gce/gpu/utilization_distribution'

In [88]:
START_TIME = datetime.datetime.fromisoformat('2021-01-22T04:00:00-00:00')
END_TIME = datetime.datetime.fromisoformat('2021-01-22T05:00:00-00:00')

In [91]:
query = Query(client, project_id, metric_type=metric_type)
query = query.select_interval(END_TIME, START_TIME)
#query = query.select_resources(instance_id='1297153725150009955')
#query = query.select_metrics(opencensus_task='py-13989@jk-test')

In [94]:
df = query.as_dataframe()
df

resource_type,gce_instance,gce_instance
project_id,jk-mlops-dev,jk-mlops-dev
zone,us-west1-b,us-west1-b
instance_id,5382047594078986292,5382047594078986292
device,0,1
2021-01-22 04:02:11.137948,count: 6\nbucket_options {\n explicit_buckets...,count: 6\nbucket_options {\n explicit_buckets...
2021-01-22 04:02:41.138088,count: 12\nbucket_options {\n explicit_bucket...,count: 12\nbucket_options {\n explicit_bucket...
2021-01-22 04:03:11.138221,count: 18\nbucket_options {\n explicit_bucket...,count: 18\nbucket_options {\n explicit_bucket...
2021-01-22 04:03:41.138345,count: 24\nbucket_options {\n explicit_bucket...,count: 24\nbucket_options {\n explicit_bucket...
2021-01-22 04:04:11.138576,count: 30\nbucket_options {\n explicit_bucket...,count: 30\nbucket_options {\n explicit_bucket...
2021-01-22 04:04:41.138740,count: 36\nbucket_options {\n explicit_bucket...,count: 36\nbucket_options {\n explicit_bucket...
2021-01-22 04:05:11.138878,count: 42\nbucket_options {\n explicit_bucket...,count: 42\nbucket_options {\n explicit_bucket...


In [27]:
from google.api_core import exceptions

In [30]:
initial_util = 70
wait_time = 8

now = time.time() - 60
seconds = int(now) 
nanos = int(
        (now - seconds) * 10**9)

for i in range(5):
    seconds += 2
    try: 
        report_gpu_metric(value=initial_util+i,
                      seconds=seconds,
                      nanos=nanos,
                      type=metric_type,
                      instance_id=instance_id,
                      zone=zone,
                      project_id=project_id)

    except exceptions.GoogleAPICallError as err:
        print(err)
    except exceptions.RetryError as err:
        print('Retry attempts to create time series failed')
    except Exception:    
        print('create_time_series: exception encountered')
        
    print('wrote metric')
    time.sleep(wait_time)

wrote metric
400 One or more TimeSeries could not be written: One or more points were written more frequently than the maximum sampling period configured for the metric.: timeSeries[0]
wrote metric
400 One or more TimeSeries could not be written: One or more points were written more frequently than the maximum sampling period configured for the metric.: timeSeries[0]
wrote metric
wrote metric
400 One or more TimeSeries could not be written: One or more points were written more frequently than the maximum sampling period configured for the metric.: timeSeries[0]
wrote metric
