In [58]:
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util
from typing import List, Optional, Text, Union, Dict, Iterable, Mapping

from tensorflow_metadata.proto.v0 import schema_pb2

In [59]:
tfdv.__version__

'0.22.0'

## Configure environment settings

In [60]:
base_schema_path = 'gs://mlops-dev-workspace/drift-monitor/templates/schema/schema.pbtxt'
baseline_stats_path = 'gs://mlops-dev-workspace/drift-monitor/templates/baseline_stats/stats.pbtxt'
schema_path = 'gs://mlops-dev-workspace/drift-monitor/schema/schema.pbtxt'

In [61]:
schema = tfdv.load_schema_text(base_schema_path)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Elevation',FLOAT,required,,-
'Aspect',FLOAT,required,,-
'Slope',FLOAT,required,,-
'Horizontal_Distance_To_Hydrology',FLOAT,required,,-
'Vertical_Distance_To_Hydrology',FLOAT,required,,-
'Horizontal_Distance_To_Roadways',FLOAT,required,,-
'Hillshade_9am',FLOAT,required,,-
'Hillshade_Noon',FLOAT,required,,-
'Hillshade_3pm',FLOAT,required,,-
'Horizontal_Distance_To_Fire_Points',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"
'Soil_Type',"'7745', '7202', '7757', '7756', '7201', '4744', '4703', '7746', '7755', '7700', '8771', '4704', '4758', '8772', '2703', '2705', '7102', '8776', '2717', '6102', '2704', '8703', '7790', '7101', '2702', '6101', '7702', '7103', '6731', '8707', '7709', '7701', '5101', '4201', '2706'"


### Fine tune the schema

In [43]:
# Do your stuff
#for feature in schema.feature:
#    print(feature.name)
#path = tfdv.FeaturePath("Wilderness_Area")

In [44]:
#tfdv.set_domain(schema, 'Elevation',  schema_pb2.FloatDomain(name='Elevation', min=1000, max=3000))

In [63]:
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Elevation',FLOAT,required,,-
'Aspect',FLOAT,required,,-
'Slope',FLOAT,required,,-
'Horizontal_Distance_To_Hydrology',FLOAT,required,,-
'Vertical_Distance_To_Hydrology',FLOAT,required,,-
'Horizontal_Distance_To_Roadways',FLOAT,required,,-
'Hillshade_9am',FLOAT,required,,-
'Hillshade_Noon',FLOAT,required,,-
'Hillshade_3pm',FLOAT,required,,-
'Horizontal_Distance_To_Fire_Points',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"
'Soil_Type',"'7745', '7202', '7757', '7756', '7201', '4744', '4703', '7746', '7755', '7700', '8771', '4704', '4758', '8772', '2703', '2705', '7102', '8776', '2717', '6102', '2704', '8703', '7790', '7101', '2702', '6101', '7702', '7103', '6731', '8707', '7709', '7701', '5101', '4201', '2706'"


### Save the updated schema

In [62]:
#tfdv.get_feature(schema, 'Wilderness_Area').skew_comparator.infinity_norm.threshold = 0.001

In [64]:
tfdv.write_schema_text(schema, schema_path)

## Run the job

In [77]:
project = 'mlops-dev-env'
log_table = 'data_validation.test1'
model = 'covertype_tf'
version = 'v3'
start_time = '2020-05-25T16:00:00'
end_time = '2020-05-25T16:30:00'
output_path = 'gs://mlops-dev-workspace/drift-monitor/output/test'
baseline_stats_file = 'gs://mlops-dev-workspace/drift-monitor/baseline_stats/stats.pbtxt'
time_window = '60m'

In [78]:
!python ../run.py \
--project={project} \
--request_response_log_table={log_table} \
--model={model} \
--version={version}\
--start_time={start_time} \
--end_time={end_time} \
--output_path={output_path} \
--schema_file={schema_path} \
--baseline_stats_file={baseline_stats_file} \
--time_window={time_window}

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
INFO:root:File gs://mlops-dev-workspace/drift-monitor/baseline_stats/stats.pbtxt did not look like a TFRecord. Try reading as a plain file.
INFO:root:Starting the request-response log analysis pipeline...
INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.internal.gcp.auth:Setting socket default timeout to 60 seconds.
INFO:apache_beam.internal.gcp.auth:socket default timeout is 60.0 seconds.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'data_validation'
 projectId: 'mlops-dev-env'
 tableId: 'test1'> referenced by query 
        SELECT FORMAT_TIMESTAMP("%G-%m-%d %T", time) as time, raw_data
        FROM 
            `data_vali

## Analyze results

In [80]:
stats = tfdv.load_statistics(stats_path)
for dataset in stats.datasets:
    print(dataset.name)




In [83]:
anomalies_path = output_path + '/' + 'anomalies.pbtxt'
!gsutil ls {anomalies_path}

gs://mlops-dev-workspace/drift-monitor/output/test/anomalies.pbtxt


In [79]:
stats_path = output_path + '/' + 'stats.pb'
!gsutil ls {stats_path}

gs://mlops-dev-workspace/drift-monitor/output/test/stats.pb


In [84]:
anomalies = tfdv.load_anomalies_text(anomalies_path)
tfdv.display_anomalies(anomalies)

In [81]:
tfdv.visualize_statistics(stats)

In [82]:
all_examples_stats = tfdv.get_slice_stats(stats, "All Examples")
tfdv.visualize_statistics(all_examples_stats)

ValueError: Invalid slice key.

In [87]:
anomaly_list = list(anomalies.anomaly_info)

In [89]:
anomaly_list

[]