In [3]:
%pip install --user tensorflow-data-validation

Collecting tensorflow-data-validation
  Using cached tensorflow_data_validation-1.6.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
Collecting pyarrow<6,>=1
  Using cached pyarrow-5.0.0-cp37-cp37m-manylinux2014_x86_64.whl (23.6 MB)
Collecting joblib<0.15,>=0.12
  Using cached joblib-0.14.1-py2.py3-none-any.whl (294 kB)
Collecting numpy<2,>=1.16
  Using cached numpy-1.20.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.3 MB)
Collecting dill<0.3.2,>=0.3.1.1
  Using cached dill-0.3.1.1-py3-none-any.whl
Collecting httplib2<0.20.0,>=0.8
  Using cached httplib2-0.19.1-py3-none-any.whl (95 kB)
Collecting typing-extensions<4,>=3.7.0
  Using cached typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Collecting google-cloud-spanner<2,>=1.13.0
  Using cached google_cloud_spanner-1.19.1-py2.py3-none-any.whl (255 kB)
Collecting google-cloud-vision<2,>=0.38.0
  Using cached google_cloud_vision-1.0.0-py2.py3-none-any.whl (435 kB)
Collecting google-cloud-bigtable<2,

In [2]:
import tensorflow_data_validation as tfdv

In [27]:
from google.cloud import aiplatform
from google.cloud import aiplatform_v1
import os

In [4]:
# Pipeline under inspection, and the GCP project / region it is looked up in.
pipeline_name = "ihr-my-pipeline"
project_id = "ihr-vertex-pipelines"
region = "europe-west4"

# Initialise the Vertex AI SDK with that project and region.
aiplatform.init(
    project=project_id,
    location=region,
)

In [5]:
# Ask the SDK for a DataFrame describing `pipeline_name`; the next cell displays it.
df = aiplatform.get_pipeline_df(pipeline=pipeline_name)

In [6]:
df

Unnamed: 0,pipeline_name,run_name
0,ihr-my-pipeline,ihr-my-pipeline-20220228110721
1,ihr-my-pipeline,ihr-my-pipeline-20220228104837
2,ihr-my-pipeline,ihr-my-pipeline-20220228103007


In [7]:
# The metadata client is pointed explicitly at the endpoint for `region`.
API_ENDPOINT = f"{region}-aiplatform.googleapis.com"
metadata_client = aiplatform_v1.MetadataServiceClient(
    client_options=dict(api_endpoint=API_ENDPOINT),
)

In [10]:
# Only artifacts created after the cut-off timestamp whose state is LIVE.
LIVE_FILTER = 'create_time > "2022-02-27T21:30:00+10:00" AND state = LIVE'

# List request against the project's default metadata store in `region`.
artifact_request = aiplatform_v1.ListArtifactsRequest(
    parent=f"projects/{project_id}/locations/{region}/metadataStores/default",
    filter=LIVE_FILTER,
)

In [12]:
# list_artifacts returns a pager that fetches result pages lazily while it is
# iterated. `artifacts` is filtered more than once further down (statistics,
# then schema), so materialise it once here: every later pass then sees the
# complete result set instead of re-walking an already consumed pager.
artifacts = list(metadata_client.list_artifacts(artifact_request))

## Visualizing statistics

In [24]:
# Keep the artifacts named 'statistics' whose URI contains this pipeline's name.
stats_artifacts = [
    artifact
    for artifact in artifacts
    if artifact.display_name == 'statistics' and pipeline_name in artifact.uri
]

In [25]:
# Fail with an actionable message instead of a bare "list index out of range"
# when the filter above matched nothing (same exception type as before).
if not stats_artifacts:
    raise IndexError(f"no 'statistics' artifact found for pipeline {pipeline_name!r}")

# NOTE(review): this takes the first match; the listing order is not controlled
# here, so confirm it is the run you intend to inspect.
my_uri = stats_artifacts[0].uri

In [26]:
my_uri

'gs://ihr-live-workshop/pipeline/237148598933/ihr-my-pipeline-20220228110721/StatisticsGen_8909936456147927040/statistics'

In [28]:
# These are gs:// URIs, whose separator is always "/". os.path.join would use
# the host OS separator, so the paths are assembled explicitly instead. A
# trailing "/" on the base URI is stripped to avoid a doubled separator.
train_uri = f"{my_uri.rstrip('/')}/Split-train/FeatureStats.pb"
eval_uri = f"{my_uri.rstrip('/')}/Split-eval/FeatureStats.pb"

In [29]:
train_uri

'gs://ihr-live-workshop/pipeline/237148598933/ihr-my-pipeline-20220228110721/StatisticsGen_8909936456147927040/statistics/Split-train/FeatureStats.pb'

In [30]:
# Load the train-split statistics from the FeatureStats.pb file at `train_uri`.
train_stats = tfdv.load_stats_binary(train_uri)

In [32]:
tfdv.visualize_statistics(train_stats)

In [33]:
# Same steps for the eval split: load its FeatureStats.pb, then render it.
eval_stats = tfdv.load_stats_binary(eval_uri)
tfdv.visualize_statistics(eval_stats)

## Check the schema

In [34]:
# Keep the artifacts named 'schema' whose URI contains this pipeline's name.
schemas = [
    artifact
    for artifact in artifacts
    if artifact.display_name == 'schema' and pipeline_name in artifact.uri
]

In [45]:
schemas[0].uri

'gs://ihr-live-workshop/pipeline/237148598933/ihr-my-pipeline-20220228110721/SchemaGen_-4925121599134236672/schema'

In [38]:
# Fail with an actionable message instead of a bare "list index out of range"
# when no schema artifact matched (same exception type as before).
if not schemas:
    raise IndexError(f"no 'schema' artifact found for pipeline {pipeline_name!r}")

# This is a gs:// URI, whose separator is always "/"; assemble it explicitly
# rather than with os.path.join, which uses the host OS separator.
schema_uri = f"{schemas[0].uri.rstrip('/')}/schema.pbtxt"

In [46]:
schema_uri

'gs://ihr-live-workshop/pipeline/237148598933/ihr-my-pipeline-20220228110721/SchemaGen_-4925121599134236672/schema/schema.pbtxt'

In [51]:
!gsutil ls {schema_uri}

gs://ihr-live-workshop/pipeline/237148598933/ihr-my-pipeline-20220228110721/SchemaGen_-4925121599134236672/schema/schema.pbtxt


In [52]:
# Load the schema from the URI computed from the discovered artifact rather
# than a pasted, run-specific path, so the schema always belongs to the same
# pipeline run as the artifact selected earlier in the notebook.
s = tfdv.load_schema_text(schema_uri)

In [53]:
type(s)

tensorflow_metadata.proto.v0.schema_pb2.Schema

In [54]:
tfdv.display_schema(s)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Amount',FLOAT,required,,-
'Class',INT,required,,-
'Time',INT,required,,-
'V1',FLOAT,required,,-
'V10',FLOAT,required,,-
'V11',FLOAT,required,,-
'V12',FLOAT,required,,-
'V13',FLOAT,required,,-
'V14',FLOAT,required,,-
'V15',FLOAT,required,,-


## Detect anomalies

In [55]:
# Check the train-split statistics against the loaded schema `s`.
train_anomalies = tfdv.validate_statistics(statistics=train_stats, schema=s)

In [56]:
tfdv.display_anomalies(train_anomalies)

In [57]:
# Check the eval-split statistics against the same schema `s`.
eval_anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=s)

In [58]:
tfdv.display_anomalies(eval_anomalies)