In [1]:
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util
from typing import List, Optional, Text, Union, Dict, Iterable, Mapping

from tensorflow_metadata.proto.v0 import schema_pb2

In [2]:
tfdv.__version__

'0.22.0'

In [3]:
schema_path = 'gs://mlops-dev-workspace/drift-monitor/schema/schema.pbtxt'

baseline_stats_path = 'gs://mlops-dev-workspace/drift-monitor/baseline_stats/stats.pbtxt'

stats_path = 'gs://mlops-dev-workspace/drift-monitor/output/test/stats.pb'

In [4]:
schema = tfdv.load_schema_text(schema_path)

In [5]:
for feature in schema.feature:
    print(feature.name)

Elevation
Aspect
Slope
Horizontal_Distance_To_Hydrology
Vertical_Distance_To_Hydrology
Horizontal_Distance_To_Roadways
Hillshade_9am
Hillshade_Noon
Hillshade_3pm
Horizontal_Distance_To_Fire_Points
Soil_Type
Wilderness_Area


In [18]:
new_feature = schema.feature.add()
new_feature.name = 'time_slice'
new_feature.type = schema_pb2.FeatureType.BYTES

In [17]:
for feature in schema.feature:
    print(feature.name, feature.type)

Elevation 3
Aspect 3
Slope 3
Horizontal_Distance_To_Hydrology 3
Vertical_Distance_To_Hydrology 3
Horizontal_Distance_To_Roadways 3
Hillshade_9am 3
Hillshade_Noon 3
Hillshade_3pm 3
Horizontal_Distance_To_Fire_Points 3
Soil_Type 1
Wilderness_Area 1
time_slice 1


In [13]:
path = tfdv.FeaturePath("Wilderness_Area")

In [14]:
tfdv

<tensorflow_data_validation.types.FeaturePath at 0x7f50e52b1510>

In [53]:
tfdv.visualize_statistics(stats)

ValueError: lhs_statistics proto contains multiple datasets. Only one dataset is currently supported.

In [58]:
for dataset in stats.datasets:
    print('************************')
    print('************************')
    print(dataset)

 low_value: 123.0
        high_value: 147.60000000000002
        sample_count: 2003.7656000000002
      }
      buckets {
        low_value: 147.60000000000002
        high_value: 172.20000000000002
        sample_count: 1665.5455999999997
      }
      buckets {
        low_value: 172.20000000000002
        high_value: 196.8
        sample_count: 1101.8456
      }
      buckets {
        low_value: 196.8
        high_value: 221.4
        sample_count: 462.9856
      }
      buckets {
        low_value: 221.4
        high_value: 246.0
        sample_count: 102.21759999999996
      }
    }
    histograms {
      buckets {
        high_value: 94.0
        sample_count: 751.6
      }
      buckets {
        low_value: 94.0
        high_value: 112.0
        sample_count: 751.6
      }
      buckets {
        low_value: 112.0
        high_value: 124.0
        sample_count: 751.6
      }
      buckets {
        low_value: 124.0
        high_value: 134.0
        sample_count: 751.6
      }
  

In [57]:
stats

 avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 751.6
        

## Run the job

In [6]:
project = 'mlops-dev-env'
log_table = 'data_validation.test1'
model = 'covertype_tf'
version = 'v3'
start_time = '2020-05-25T16:01:10'
end_time = '2020-05-25T22:50:30'
output_path = 'gs://mlops-dev-workspace/drift-monitor/output/test'
schema_file = 'gs://mlops-dev-workspace/drift-monitor/schema/schema.pbtxt'
baseline_stats_file = 'gs://mlops-dev-workspace/drift-monitor/baseline_stats/stats.pbtxt'
time_window = '60m'

In [8]:
!gsutil ls {schema_file}

gs://mlops-dev-workspace/drift-monitor/schema/schema.pbtxt


In [9]:
!python ../run.py \
--project={project} \
--request_response_log_table={log_table} \
--model={model} \
--version={version}\
--start_time={start_time} \
--end_time={end_time} \
--output_path={output_path} \
--schema_file={schema_file} \
--baseline_stats_file={baseline_stats_file} \
--time_window={time_window}

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
INFO:root:File gs://mlops-dev-workspace/drift-monitor/baseline_stats/stats.pbtxt did not look like a TFRecord. Try reading as a plain file.
INFO:root:Starting the request-response log analysis pipeline...
INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.internal.gcp.auth:Setting socket default timeout to 60 seconds.
INFO:apache_beam.internal.gcp.auth:socket default timeout is 60.0 seconds.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'data_validation'
 projectId: 'mlops-dev-env'
 tableId: 'test1'> referenced by query 
        SELECT FORMAT_TIMESTAMP("%G-%m-%d %T", time) as time, raw_data
        FROM 
            `data_vali