# TensorFlow Data Validation example - UP

In [1]:
from tensorflow_data_validation import GenerateStatistics, TFExampleDecoder
from tensorflow_metadata.proto.v0 import statistics_pb2
import logging

import apache_beam as beam
import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, WorkerOptions, DebugOptions, SetupOptions
from tensorflow_data_validation import statistics
from spotify_tensorflow.tfx.tfdv import TfDataValidator
from spotify_tensorflow.tfx.utils import create_setup_file
from tensorflow_data_validation import StatsOptions

### Set up

In [2]:
# Beam pipeline configuration for the statistics job. Every `view_as(...)`
# below is a typed view onto this one `options` object, which is what gets
# handed to TFDV later on.
options = PipelineOptions()

# Run the pipeline on Dataflow.
options.view_as(StandardOptions).runner = 'DataflowRunner'

# GCP project, job name, and the GCS locations used for staging and temp files.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = "users-protection"
google_cloud_options.job_name = "statsjob3"
google_cloud_options.staging_location = "gs://hchudgar_test/staging"
google_cloud_options.temp_location = "gs://hchudgar_test/temp"

# Opt in to the 'beam_fn_api' experiment.
debug_options = options.view_as(DebugOptions)
debug_options.experiments = ['beam_fn_api']

# Point the job at the setup file produced by `create_setup_file()`
# (presumably what installs the Python dependencies on the workers -- confirm).
setup_options = options.view_as(SetupOptions)
setup_file_path = create_setup_file()
setup_options.setup_file = setup_file_path

# Disabled alternative: use a custom worker container image.
# worker_options = options.view_as(WorkerOptions)
# worker_options.worker_harness_container_image = 'gcr.io/users-protection/test-hchudgar-tfdv'

### Getting the statistics

#### Option 1: Generate stats from existing data

1. Specify where the TFRecords are kept and where the stats should be stored
2. Generate the statistics

This takes almost an hour to compute.

In [None]:
# Input: glob matching every TFRecord shard of the training split.
data_location = "gs://hchudgar_test/email_open/tf/examples/email_open.temp.BaseInputDataV1/2019-04-17/20190423T204516.878251-eee6280da13e/training/*.tfrecords"

# Output: `output_path` is the path of the statistics *file* to write, not a
# directory. The original value ended in "/", which leaves the file without a
# name; give it an explicit one (same `_stats.pb` name the "Option 2" cell loads).
output_path = "gs://hchudgar_test/email_open/tf/examples/email_open.temp.BaseInputDataV1/2019-04-17/stats/_stats.pb"

# Runs a Beam job with the pipeline `options` configured above and returns the
# computed statistics. Default `StatsOptions()` are used.
stats = tfdv.generate_statistics_from_tfrecord(
    data_location=data_location,
    output_path=output_path,
    stats_options=StatsOptions(),
    pipeline_options=options,
)


#### Option 2: Read from existing location

In [6]:
# Shortcut: load statistics already written to GCS instead of recomputing them.
# Note that this rebinds `stats`, replacing whatever "Option 1" produced.
statsFilePath = (
    "gs://slayton_test/email_open/tf/examples/email_open.BaseInputDataV1/2019-04-01/"
    "20190411T155410.476299-3e9102f6a419/training/_stats.pb"
)
stats = tfdv.load_statistics(statsFilePath)

### Visualize the stats of a single dataset

This function uses [Facets](https://pair-code.github.io/facets/) to visualize statistics.
A quick look highlights features with many missing values or many zeros, along with each feature's distribution and some basic statistics.

In [7]:
# Render the Facets overview for the single dataset held in `stats`.
tfdv.visualize_statistics(lhs_statistics=stats)

### Compare stats of two different datasets

We can compare the statistics of two different datasets to either:
1. Check whether two days have different data distributions (e.g. email vs push features)
2. Compare the distributions of a training and a test dataset to identify data drift

In [10]:
# Statistics of a second dataset (the 2019-04-16 partition) to compare with `stats`.
rhs = tfdv.load_statistics(
    "gs://slayton_test/email_open/tf/examples/email_open.BaseInputDataV1/2019-04-16/"
    "20190417T184427.921839-7f41b7d82e6e/training/_stats.pb"
)

In [11]:
# Side-by-side Facets view of the two datasets.
# When the notebook is run top to bottom, `stats` is last assigned by the
# "Option 2" cell, which loads the 2019-04-01 partition -- so that is the label
# it must carry (the original "2019-04-17" label only matched Option 1's data).
# If you skipped Option 2 and kept Option 1's result, change it back.
tfdv.visualize_statistics(
    lhs_statistics=stats,
    rhs_statistics=rhs,
    lhs_name="2019-04-01",
    rhs_name="2019-04-16",
)

### Schema generation from stats

In [13]:
# Derive a schema from the 2019-04-16 statistics (`rhs`).
sc = tfdv.infer_schema(rhs)

In [14]:
# Show the inferred schema as tables: one row per feature, then the string domains.
tfdv.display_schema(schema=sc)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'userAggTable.days_since_last_dau',INT,optional,single,-
'emailLabelTable.hrs_to_open',INT,optional,single,-
'userAggTable.num_push_click_week',INT,optional,single,-
'userAggTable.email_click_open_rate_week',FLOAT,optional,single,-
'userAggTable.primary_platform',STRING,optional,single,'userAggTable.primary_platform'
'userAggTable.dsr',INT,optional,single,-
'userAggTable.num_in_app_month',INT,optional,single,-
'emailLabelTable.channel',STRING,required,,'emailLabelTable.channel'
'userAggTable.in_app_click_rate_yesterday',FLOAT,optional,single,-
'userAggTable.num_in_app_click_yesterday',INT,optional,single,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'userAggTable.primary_platform',"'android', 'cast', 'chrome', 'ios', 'linux', 'not_applicable', 'osx', 'other', 'playstation', 'tizen', 'unknown', 'windows', 'xbox'"
'emailLabelTable.channel','email'
'emailLabelTable.optout_type',"'New Music - Email', 'Playlist Updates - Email', 'Product News - Email', 'Recommended Music - Email', 'Spotify News and Offers (default) - Email'"
'userAggTable.product_type',"'ad-supported', 'non-subscriber', 'subscriber'"
'userAggTable.product_category',"'ad-supported', 'bundle', 'campaign', 'duo', 'family', 'iap', 'non-subscriber', 'standard', 'student', 'trial-opt-in', 'trial-opt-out'"
'userAggTable.country_code',"'AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KW', 'LB', 'LI', 'LT', 'LU', 'LV', 'MA', 'MC', 'MT', 'MX', 'MY', 'NI', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'SA', 'SE', 'SG', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'US', 'UY', 'VN', 'ZA', 'other'"
'userAggTable.dsr_bucket',"'0 days ago', '1 - 6 days ago', '30 - 179 days ago', '7 - 29 days ago', '>179 days ago'"
'userAggTable.first_platform',"'android', 'ios', 'linux', 'none', 'osx', 'other', 'web', 'windows'"


### Data validation through schema

Schema inferred from training data can be used to validate test data

In [19]:
# Check the statistics in `stats` against the schema inferred from `rhs`.
anomalies = tfdv.validate_statistics(statistics=stats, schema=sc)

In [20]:
# Tabulate whatever the schema validation flagged.
tfdv.display_anomalies(anomalies=anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'\'emailLabelTable.optout_type\'',Unexpected string values,Examples contain values missing from the schema: Concert Notifications - Email (~32%).


Training and serving data can be compared to detect skew

In [21]:
# Validate `rhs` against the schema while also comparing it with `stats`,
# which is passed in as the serving-side statistics.
anomalies = tfdv.validate_statistics(
    statistics=rhs,
    schema=sc,
    serving_statistics=stats,
)

In [22]:
# Tabulate the anomalies found by the training-vs-serving validation.
tfdv.display_anomalies(anomalies=anomalies)

Custom thresholds can be set to warn if training and serving data are skewed

In [24]:
# The L-infinity comparator only detects skew on categorical features, so the
# threshold is set on a STRING feature from the inferred schema rather than on
# the INT feature 'userAggTable.days_since_last_dau', where it has no effect.
skew_feature = tfdv.get_feature(sc, 'userAggTable.primary_platform')
skew_feature.skew_comparator.infinity_norm.threshold = 0.01

# A threshold only matters for validations run after it is set, so validate
# again (training = `rhs`, serving = `stats`) and show the result.
skew_anomalies = tfdv.validate_statistics(
    statistics=rhs,
    schema=sc,
    serving_statistics=stats,
)
tfdv.display_anomalies(skew_anomalies)

Custom thresholds can be set to warn if current training data has diverged from when the model was initially developed

In [26]:
# The L-infinity comparator only detects drift on categorical features, so the
# threshold is set on a STRING feature from the inferred schema rather than on
# the INT feature 'userAggTable.days_since_last_dau', where it has no effect.
drift_feature = tfdv.get_feature(sc, 'userAggTable.primary_platform')
drift_feature.drift_comparator.infinity_norm.threshold = 0.01

# Drift is measured against `previous_statistics`; without it the threshold is
# never evaluated. Here `rhs` stands in as the earlier span for `stats`
# (NOTE(review): pick the pair of spans that are actually consecutive for your data).
drift_anomalies = tfdv.validate_statistics(
    statistics=stats,
    schema=sc,
    previous_statistics=rhs,
)
tfdv.display_anomalies(drift_anomalies)