In [1]:
import os
import tfx

# Record the installed TFX version so readers know which release this notebook ran against.
print("TFX Version: {}".format(tfx.__version__))

TFX Version: 0.14.0


# TensorFlow Extended (TFX)

## ExampleGen

The ExampleGen component consumes data from an external source and emits `tf.Example` records for further use.

In [3]:
from tfx.utils.dsl_utils import csv_input
from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen
from tfx.proto import example_gen_pb2

In [None]:
# Directory holding the taxi CSV files. Set the TFX_TAXI_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset, the
# original local path is used, so existing behavior is unchanged.
csv_example = csv_input(os.environ.get(
    "TFX_TAXI_DATA_DIR",
    "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/"))

# Split the examples into a "train" and an "eval" split using hash buckets
# (3 buckets for train, 1 for eval).
output_config = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(splits=[
    example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=3),
    example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1)
]))

# ExampleGen component that reads the CSV input and applies the split above.
csv_example_gen = CsvExampleGen(input_base=csv_example, output_config=output_config)

In [None]:
# Inspect the outputs exposed by the CsvExampleGen component.
csv_example_gen.outputs

## StatisticsGen

The StatisticsGen component uses TensorFlow Data Validation (TFDV) to generate statistics from your datasets.

In [4]:
from tfx.components.statistics_gen.component import StatisticsGen

In [None]:
# Build a StatisticsGen component fed by the examples emitted by CsvExampleGen.
compute_eval_statistics = StatisticsGen(
    input_data=csv_example_gen.outputs['examples'],   # the key `examples` is the default one
    instance_name="compute-stats"
)

In [None]:
# Inspect the outputs exposed by the StatisticsGen component.
compute_eval_statistics.outputs

## SchemaGen

The SchemaGen component infers a schema describing the input dataset. The schema can specify, for example, each feature's data type and the allowed range of its values in your training dataset.

In [5]:
from tfx.components.schema_gen.component import SchemaGen

In [None]:
# Build a SchemaGen component from the 'output' of the StatisticsGen component.
infer_schema = SchemaGen(stats=compute_eval_statistics.outputs['output'])

In [None]:
# Inspect the outputs exposed by the SchemaGen component.
infer_schema.outputs

## ExampleValidator

The ExampleValidator component identifies anomalies in your training and serving datasets.

In [None]:
from tfx.components.example_validator.component import ExampleValidator

In [None]:
# Build an ExampleValidator component from the StatisticsGen 'output' (stats)
# and the SchemaGen 'output' (schema).
validate_stat = ExampleValidator(
    stats=compute_eval_statistics.outputs["output"],
    schema=infer_schema.outputs["output"]
)

In [None]:
# Display the ExampleValidator component object.
validate_stat

# TensorFlow Data Validation (TFDV)

In [6]:
import tensorflow_data_validation as tfdv

## Descriptive Statistics

### From CSV files

In [7]:
# Compute descriptive statistics over the taxi CSV file.
# The data directory can be overridden with the TFX_TAXI_DATA_DIR environment
# variable; when it is unset, the original local path is used.
stats = tfdv.generate_statistics_from_csv(
    data_location=os.path.join(
        os.environ.get(
            "TFX_TAXI_DATA_DIR",
            "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/"),
        "data.csv"))


























Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [8]:
# Check the type of the object returned by generate_statistics_from_csv.
type(stats)

tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList

### From TFRecord Files

In [19]:
# Enable semantic-domain statistics so that image features can be handled.
# NOTE(review): `semantic_doman` looks like a typo of "semantic_domain"; the
# name is kept because a later cell passes it as `stats_options`.
semantic_doman = tfdv.StatsOptions(enable_semantic_domain_stats=True)

In [None]:
# Compute statistics over a TFRecord file, using the semantic-domain options.
# The file location can be overridden with the TFDV_TFRECORD_PATH environment
# variable; when it is unset, the original local path is used.
tfrecord_stats = tfdv.generate_statistics_from_tfrecord(
    data_location=os.environ.get(
        "TFDV_TFRECORD_PATH",
        "/Users/jiankaiwang/Google_drives/public/document/201908_DL_ObjectDetection/tfrecords/train.tfrecords"),
    stats_options=semantic_doman)

In [None]:
# Display the statistics computed from the TFRecord file.
tfrecord_stats

## Visualization

In [9]:
# Render the CSV-derived statistics with TFDV's visualization helper.
tfdv.visualize_statistics(stats)

In [None]:
# Render the TFRecord-derived statistics with TFDV's visualization helper.
tfdv.visualize_statistics(tfrecord_stats)

## Schema generation

In [10]:
# Infer a schema from the statistics computed over the CSV data.
schema = tfdv.infer_schema(statistics=stats)

In [11]:
# Show the schema's type together with its full contents.
type(schema), schema

(tensorflow_metadata.proto.v0.schema_pb2.Schema, feature {
   name: "pickup_community_area"
   type: INT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
       size: 1
     }
   }
 }
 feature {
   name: "fare"
   type: FLOAT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
       size: 1
     }
   }
 }
 feature {
   name: "trip_start_month"
   type: INT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
       size: 1
     }
   }
 }
 feature {
   name: "trip_start_hour"
   type: INT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
       size: 1
     }
   }
 }
 feature {
   name: "trip_start_day"
   type: INT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
       size: 1
     }
   }
 }
 feature {
   name: "trip_start_timestamp"
   type: INT
   presence {
     min_fraction: 1.0
     min_count: 1
   }
   shape {
     dim {
  

Edit the schema as needed.

In [None]:
# Set the presence constraint of `payment_type` to min_fraction = 0.5.
# The assignment modifies the `schema` object in place.
tfdv.get_feature(schema, "payment_type").presence.min_fraction = 0.5

In [None]:
# Display the schema object.
schema

In [12]:
# Show the schema as tables: one for features, one for domain values.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'pickup_community_area',INT,required,,-
'fare',FLOAT,required,,-
'trip_start_month',INT,required,,-
'trip_start_hour',INT,required,,-
'trip_start_day',INT,required,,-
'trip_start_timestamp',INT,required,,-
'pickup_latitude',FLOAT,required,,-
'pickup_longitude',FLOAT,required,,-
'dropoff_latitude',FLOAT,optional,single,-
'dropoff_longitude',FLOAT,optional,single,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'payment_type',"'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Prcard', 'Unknown'"
'company',"'0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2092 - 61288 Sbeih company', '2192 - 73487 Zeymane Corp', '2192 - Zeymane Corp', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '2823 - 73307 Seung Lee', '3011 - 66308 JBL Cab Inc.', '3094 - 24059 G.L.B. Cab Co', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3319 - CD Cab Co', '3385 - 23210 Eman Cab', '3385 - Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - 57856 Ilie Malec', '3897 - Ilie Malec', '4053 - 40193 Adwar H. Nikola', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4197 - Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - 88805 Valley Cab Co', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5874 - Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6057 - 24657 Richard Addo', '6488 - 83287 Zuha Taxi', '6574 - Babylon Express Inc.', '6742 - 83735 Tasha ride inc', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation'"


## Checks data for errors

In [13]:
# Validate the computed statistics against the schema and keep the resulting anomalies.
anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)

Visualize anomaly data.

In [14]:
# Display the anomalies found by the validation above.
tfdv.display_anomalies(anomalies=anomalies)

In [15]:
# Inspect the raw anomaly details; the recorded output is empty ({}).
anomalies.anomaly_info

{}

If there is an anomaly, you can add another allowed value into the schema.

In [16]:
# Allow an additional value in the `payment_type` domain of the schema.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate "new_value" entry.
payment_type_domain = tfdv.get_domain(schema, "payment_type")
if "new_value" not in payment_type_domain.value:
    payment_type_domain.value.append("new_value")

In [17]:
# Show the allowed values currently listed in the `payment_type` domain.
tfdv.get_domain(schema, "payment_type").value

['Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Prcard', 'Unknown', 'new_value']

Validate per-example data. This may take a long time.

In [16]:
# Build statistics options that carry the schema; a later cell passes them
# to the per-example CSV validation.
options = tfdv.StatsOptions(schema=schema)

In [None]:
# Validate the examples in the taxi CSV file against the schema carried by `options`.
# The data directory can be overridden with the TFX_TAXI_DATA_DIR environment
# variable; when it is unset, the original local path is used.
anomalous_example_stats = tfdv.validate_examples_in_csv(
    data_location=os.path.join(
        os.environ.get(
            "TFX_TAXI_DATA_DIR",
            "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/"),
        "data.csv"),
    stats_options=options)

Validate individual instances to check whether a particular example exhibits anomalies.

## Schema Environments

## Checking data skew and drift

* Skew: between training and serving datasets
* Drift: among training datasets

In [21]:
# Compute descriptive statistics over the serving CSV file.
# The data directory can be overridden with the TFX_TAXI_DATA_DIR environment
# variable; when it is unset, the original local path is used.
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=os.path.join(
        os.environ.get(
            "TFX_TAXI_DATA_DIR",
            "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/"),
        "data_serving.csv"))

In [26]:
# Set the skew comparator's infinity-norm threshold for `payment_type` to 0.01.
# The assignment modifies the `schema` object in place.
tfdv.get_feature(schema, "payment_type").skew_comparator.infinity_norm.threshold = 0.01

In [28]:
# Display the `payment_type` feature to confirm the skew comparator was set.
tfdv.get_feature(schema, "payment_type")

name: "payment_type"
type: BYTES
domain: "payment_type"
presence {
  min_fraction: 1.0
  min_count: 1
}
skew_comparator {
  infinity_norm {
    threshold: 0.01
  }
}
shape {
  dim {
    size: 1
  }
}

In [30]:
# Validate the training statistics against the schema, also supplying the
# serving statistics, and keep the resulting anomalies for display.
skew_anomalies = tfdv.validate_statistics(
    statistics=stats,
    schema=schema,
    serving_statistics=serving_stats,
)

In [32]:
# Display the anomalies found when the serving statistics were included.
tfdv.display_anomalies(skew_anomalies)