In [28]:
# import required libs
import glob
import os

import tensorflow as tf
import tensorflow_data_validation as tfdv
# Report the library versions this notebook was executed with.
version_lines = [
    'TF version: {}'.format(tf.version.VERSION),
    'TFDV version: {}'.format(tfdv.version.__version__),
]
print('\n'.join(version_lines))

TF version: 2.11.0
TFDV version: 1.12.0


In [29]:
# Read artifact information from metadata store.
from pipeline import configs

from tfx.orchestration import metadata
from tfx.types import standard_artifacts

# Open the SQLite metadata store configured for the pipeline and collect the
# recorded artifacts of each of the four types this notebook inspects.
metadata_connection_config = metadata.sqlite_metadata_connection_config(configs.METADATA_PATH)

with metadata.Metadata(metadata_connection_config) as store:
    (
        example_artifacts,
        stats_artifacts,
        schema_artifacts,
        anomalies_artifacts,
    ) = (
        store.get_artifacts_by_type(artifact_class.TYPE_NAME)
        for artifact_class in (
            standard_artifacts.Examples,
            standard_artifacts.ExampleStatistics,
            standard_artifacts.Schema,
            standard_artifacts.ExampleAnomalies,
        )
    )

In [30]:
# Locations of the pipeline outputs, derived from the artifact URIs read from
# the metadata store.
# NOTE(review): index 0 takes the first artifact the store returned for each
# type — confirm that is the intended run if the pipeline ran more than once.

def _split_file(artifact_uri, split_dir, file_name):
    """Return the path of `file_name` inside the `split_dir` folder of an artifact."""
    return os.path.join(artifact_uri, split_dir, file_name)


# Example pipeline output path
example_tfrecord = example_artifacts[0].uri
train_example_tfrecord = _split_file(
    example_tfrecord, 'Split-train', 'data_tfrecord-00000-of-00001.gz')
eval_example_tfrecord = _split_file(
    example_tfrecord, 'Split-eval', 'data_tfrecord-00000-of-00001.gz')
example_message = "Train example tfrecord file: {} \nEval example tfrecord file: {}"
print(example_message.format(train_example_tfrecord, eval_example_tfrecord))

# Stats pipeline output path
stats_path = stats_artifacts[0].uri
train_stats_file = _split_file(stats_path, 'Split-train', 'FeatureStats.pb')
eval_stats_file = _split_file(stats_path, 'Split-eval', 'FeatureStats.pb')
stats_message = "Train stats file:{} \nEval stats file:{}"
print(stats_message.format(train_stats_file, eval_stats_file))

# Schema path
schema_file = os.path.join(schema_artifacts[0].uri, 'schema.pbtxt')
print("Generated schema file:{}".format(schema_file))

# Anomalies path
anomalies_uri = anomalies_artifacts[0].uri
train_anomalies_file = _split_file(anomalies_uri, 'Split-train', 'SchemaDiff.pb')
eval_anomalies_file = _split_file(anomalies_uri, 'Split-eval', 'SchemaDiff.pb')
anomalies_message = "Train anomalies file:{} \nEval anomalies file:{}"
print(anomalies_message.format(train_anomalies_file, eval_anomalies_file))

Train example tfrecord file: ./tfx_pipeline_output/vector-search-pipeline/BigQueryExampleGen/examples/1/Split-train/data_tfrecord-00000-of-00001.gz 
Eval example tfrecord file: ./tfx_pipeline_output/vector-search-pipeline/BigQueryExampleGen/examples/1/Split-eval/data_tfrecord-00000-of-00001.gz
Train stats file:./tfx_pipeline_output/vector-search-pipeline/StatisticsGen/statistics/3/Split-train/FeatureStats.pb 
Eval stats file:./tfx_pipeline_output/vector-search-pipeline/StatisticsGen/statistics/3/Split-eval/FeatureStats.pb
Generated schema file:./tfx_pipeline_output/vector-search-pipeline/SchemaGen/schema/4/schema.pbtxt
Train anomalies file:./tfx_pipeline_output/vector-search-pipeline/ExampleValidator/anomalies/5/Split-train/SchemaDiff.pb 
Eval anomalies file:./tfx_pipeline_output/vector-search-pipeline/ExampleValidator/anomalies/5/Split-eval/SchemaDiff.pb


In [31]:
# Build one GZIP-compressed TFRecord dataset per split.
train_dataset = tf.data.TFRecordDataset([train_example_tfrecord], compression_type='GZIP')
# The eval dataset must read the eval split file; it previously re-read the
# train file, so "eval" data was silently identical to the train data.
eval_dataset = tf.data.TFRecordDataset([eval_example_tfrecord], compression_type='GZIP')

In [32]:
# Feature spec used to parse the serialized examples: every feature is declared
# as a fixed-length feature with an empty shape.
schema = {
    feature_name: tf.io.FixedLenFeature([], dtype=feature_dtype)
    for feature_name, feature_dtype in (
        ("id", tf.int64),
        ("title", tf.string),
        ("body", tf.string),
        ("tags", tf.string),
    )
}

# Parse example tfrecord bytes
def decode_single_fn(single_record_bytes):
    """Parse one serialized example using the module-level `schema` feature spec.

    Args:
        single_record_bytes: The serialized example to parse.

    Returns:
        Whatever `tf.io.parse_single_example` returns for the features in `schema`.
    """
    return tf.io.parse_single_example(
        serialized=single_record_bytes,
        features=schema,
    )

def decode_batch_fn(batch_record_bytes):
    """Parse a batch of serialized examples using the module-level `schema` feature spec.

    Args:
        batch_record_bytes: The batch of serialized examples to parse.

    Returns:
        Whatever `tf.io.parse_example` returns for the features in `schema`.
    """
    return tf.io.parse_example(
        serialized=batch_record_bytes,
        features=schema,
    )

In [37]:
# Print dataset sample

def _show_features(parsed_record):
    """Print each feature named in `schema` next to its value from `parsed_record`."""
    for feature_name in schema:
        print(feature_name, ": ", parsed_record[feature_name].numpy())


# First record, parsed on its own.
for single_record in train_dataset.map(decode_single_fn).take(1):
    _show_features(single_record)

print("\n --------------------------------------------------------- \n")

# First batch of five records, parsed together.
for batch_record in train_dataset.batch(5).map(decode_batch_fn).take(1):
    _show_features(batch_record)

id :  1829901
title :  b'Pointers in c (how to point to the first char in a string with a pointer pointing somewhere else in the same string)'
body :  b"<p>If I have a pointer that is pointing somewhere in a string, let's say it is pointing at the third letter (we do not know the letter position, basically we don't know it is the third letter), and we want it to point back to the first letter so we can make the string to be NULL how do we do that?</p>\n\n<p><strong>For example:</strong></p>\n\n<p>if we have <code>ascii</code> as a pointer\n<code>ascii</code> is pointing now somewhere in the string, and i want it to point at the first char of the string how do i do that?</p>\n\n<p>(Note:\nI tried saying</p>\n\n<pre><code>int len = strlen(ascii);\nascii -= len;\nascii = '0';\n</code></pre>\n\n<p>but it is not working, it changes wherever the pointer is to 0 but not the first char to 0)</p>"
tags :  b'c'

 --------------------------------------------------------- 

id :  [1829901 1482521 

In [38]:
# Load the statistics generated by StatisticsGen for both splits and show them
# side by side (eval on the left, train on the right).
# (tfdv.load_statistics was previously left here as a disabled alternative loader.)
train_stats = tfdv.load_stats_binary(train_stats_file)
eval_stats = tfdv.load_stats_binary(eval_stats_file)

tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    rhs_statistics=train_stats,
    lhs_name='EVAL_DATASET',
    rhs_name='TRAIN_DATASET',
)

In [39]:
# Load the schema generated by SchemaGen.
# It is bound to its own name so it does not overwrite the `schema` feature
# spec that decode_single_fn / decode_batch_fn read at call time; re-running
# the parsing cells after this one would otherwise fail.
generated_schema = tfdv.load_schema_text(schema_file)
tfdv.display_schema(schema=generated_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'body',BYTES,required,,-
'id',INT,required,,-
'tags',BYTES,required,,-
'title',BYTES,required,,-


In [40]:
# Load the data validation results from ExampleValidator.
# Both splits are shown: eval_anomalies_file is resolved earlier in the
# notebook but was never loaded, so the eval split went unchecked.
load_anomalies = tfdv.utils.anomalies_util.load_anomalies_binary
anomalies = load_anomalies(train_anomalies_file)  # train split (original name kept)
eval_anomalies = load_anomalies(eval_anomalies_file)

print('Train split anomalies:')
tfdv.display_anomalies(anomalies)
print('Eval split anomalies:')
tfdv.display_anomalies(eval_anomalies)