In [1]:
# import required libs
import glob
import os
import sys

import tensorflow as tf
import tensorflow_data_validation as tfdv

target_dir = os.path.dirname(os.getcwd())
target_dir_fraud = target_dir + "/fraud"
sys.path.extend([target_dir, target_dir_fraud])

In [2]:
# Read artifact information from metadata store.
from fraud.local_runner import METADATA_PATH

from tfx.orchestration import metadata
from tfx.types import standard_artifacts

metadata_connection_config = metadata.sqlite_metadata_connection_config(METADATA_PATH)
with metadata.Metadata(metadata_connection_config) as store:
    stats_artifacts = store.get_artifacts_by_type(standard_artifacts.ExampleStatistics.TYPE_NAME)
    schema_artifacts = store.get_artifacts_by_type(standard_artifacts.Schema.TYPE_NAME)
    anomalies_artifacts = store.get_artifacts_by_type(standard_artifacts.ExampleAnomalies.TYPE_NAME)


In [21]:
# configure output paths
# Exact paths to output artifacts can also be found on KFP Web UI if you are using kubeflow.
stats_path = [entry for entry in stats_artifacts if "StatisticsGen" in entry.uri][-1].uri
train_stats_file = os.path.join(stats_path, 'Split-train', 'FeatureStats.pb')
eval_stats_file = os.path.join(stats_path, 'Split-eval', 'FeatureStats.pb')
print("Train stats file:{}, Eval stats file:{}".format(train_stats_file, eval_stats_file))

schema_file = os.path.join(schema_artifacts[-1].uri, 'schema.pbtxt')
print("Generated schame file:{}".format(schema_file))
anomalies_file = os.path.join(anomalies_artifacts[-1].uri, 'anomalies.pbtxt')
print("Generated anomalies file:{}".format(anomalies_file))

Train stats file:/Users/ivan/Documents/dev/learn/tfx/pipeline_outputs/tfx_pipeline_output/fraud/StatisticsGen/statistics/151/Split-train/FeatureStats.pb, Eval stats file:/Users/ivan/Documents/dev/learn/tfx/pipeline_outputs/tfx_pipeline_output/fraud/StatisticsGen/statistics/151/Split-eval/FeatureStats.pb
Generated schame file:/Users/ivan/Documents/dev/learn/tfx/pipeline_outputs/tfx_pipeline_output/fraud/Transform/post_transform_schema/154/schema.pbtxt
Generated anomalies file:/Users/ivan/Documents/dev/learn/tfx/pipeline_outputs/tfx_pipeline_output/fraud/Transform/post_transform_anomalies/154/anomalies.pbtxt


In [22]:
# load generated statistics from StatisticsGen
#stats = tfdv.load_statistics(stats_file)
tfdv.load_stats_binary
train_stats = tfdv.load_stats_binary(train_stats_file)
eval_stats = tfdv.load_stats_binary(eval_stats_file)
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=eval_stats,
    lhs_name='train',
    rhs_name='eval',
)

In [23]:
# load generated schema from SchemaGen
schema = tfdv.load_schema_text(schema_file)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'AMT_CREDIT_xf',FLOAT,required,,-
'AMT_INCOME_TOTAL_xf',INT,required,,-
'CNT_CHILDREN_xf',FLOAT,required,,-
'FLAG_MOBIL_xf',INT,required,,-
'NAME_TYPE_SUITE_xf',INT,required,,-
'TARGET_xf',INT,required,,-


In [27]:
# load data validation result from ExampleValidator
try:
    anomalies = tfdv.load_anomalies_text(anomalies_file)
    tfdv.display_anomalies(anomalies)
except:
    print('no anomalies found')

no anomalies found
