In [1]:
import tensorflow_data_validation as tfdv

# Report the installed TFDV release so readers can match their environment.
tfdv_version = tfdv.version.__version__
print('TFDV version: {}'.format(tfdv_version))

TFDV version: 0.30.0


# Generate Statistics

In [3]:
# Compute statistics for the training CSV and display them.

train_csv_path = "train.csv"
my_train_stats = tfdv.generate_statistics_from_csv(train_csv_path)
tfdv.visualize_statistics(lhs_statistics=my_train_stats)

In [23]:
# Write the training statistics to a text file.

stats_output_path = "my_train_stats.pbtext"
tfdv.write_stats_text(my_train_stats, stats_output_path)

# To inspect the exported file, uncomment the lines below:
# with open(stats_output_path, "r") as stats_file:
#     print(stats_file.read())

# Generate Schema
 

In [7]:
# Infer a schema from the training statistics and display it.

import warnings

# Install a global "ignore" filter: every warning raised after this point
# (in this cell and in later ones) is suppressed.
warnings.filterwarnings(action='ignore')

my_schema = tfdv.infer_schema(my_train_stats)
tfdv.display_schema(my_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'race',STRING,required,,'race'
'risk_level',STRING,required,,'risk_level'
'id',INT,required,,-
'age',INT,required,,-
'salary',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'race',"'Chinese', 'Indian', 'Malay', 'Others'"
'risk_level',"'high', 'low', 'medium'"


In [22]:
# Write the inferred schema to a text file.
schema_output_path = "train_schema.pbtext"
tfdv.write_schema_text(my_schema, schema_output_path)

# To inspect the exported file, uncomment the lines below:
# with open(schema_output_path, "r") as schema_file:
#     print(schema_file.read())

# Check Anomalies 

In [10]:
# Check a second dataset against the schema inferred from the training data.

# Compute statistics for the CSV that is missing a field.
missing_field_csv_path = "train-missing-field.csv"
my_train_missing_stats = tfdv.generate_statistics_from_csv(missing_field_csv_path)

# Compare those statistics with the schema and show any anomalies found.
my_anomalies = tfdv.validate_statistics(my_train_missing_stats, my_schema)
tfdv.display_anomalies(my_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'risk_level',Column dropped,Column is completely missing


In [11]:
# Ignore the expected validation error by using schema environments.

# All features are by default in both TRAINING and SERVING environments.
# Each append is guarded so that re-running this cell does not add duplicate
# entries to the schema's repeated `default_environment` field.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in my_schema.default_environment:
        my_schema.default_environment.append(env_name)

# Specify that the 'risk_level' feature is not in the SERVING environment.
# Guarded for the same reason: the cell must be safe to run more than once.
risk_level_feature = tfdv.get_feature(my_schema, 'risk_level')
if 'SERVING' not in risk_level_feature.not_in_environment:
    risk_level_feature.not_in_environment.append('SERVING')

# Validate the missing-field statistics again, this time in the SERVING
# environment, and display whatever anomalies remain.
serving_anomalies_with_env = tfdv.validate_statistics(statistics=my_train_missing_stats,
                                                      schema=my_schema,
                                                      environment='SERVING')

tfdv.display_anomalies(serving_anomalies_with_env)

# Detect Data Skew

In [13]:
# Detect skew between the training data and the test (serving) data.

test_csv_path = "test.csv"
my_test_stats = tfdv.generate_statistics_from_csv(test_csv_path)

# Set the skew threshold for 'race': an L-infinity distance between training
# and serving above 0.1 is reported as an anomaly.
race = tfdv.get_feature(my_schema, 'race')
race.skew_comparator.infinity_norm.threshold = 0.1

my_skew_anomalies = tfdv.validate_statistics(
    statistics=my_train_stats,
    schema=my_schema,
    serving_statistics=my_test_stats,
)
tfdv.display_anomalies(my_skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'race',High Linfty distance between training and serving,"The Linfty distance between training and serving is 0.75 (up to six significant digits), above the threshold 0.1. The feature value with maximum difference is: Chinese"


In [15]:
# Show the training and test statistics side by side.
tfdv.visualize_statistics(
    lhs_statistics=my_train_stats,
    lhs_name='Train_Dataset',
    rhs_statistics=my_test_stats,
    rhs_name='Test_Dataset',
)

# Detect Data Drift

In [14]:
# Compute statistics for the second training dataset.
train2_csv_path = "train2.csv"
my_train2_stats = tfdv.generate_statistics_from_csv(train2_csv_path)

# Infer a separate schema so the drift settings below do not touch the
# schema used in the data skew example.
my_schema2 = tfdv.infer_schema(my_train_stats)

# Set the drift threshold for 'risk_level': an L-infinity distance between
# current and previous statistics above 0.1 is reported as an anomaly.
risk_level = tfdv.get_feature(my_schema2, 'risk_level')
risk_level.drift_comparator.infinity_norm.threshold = 0.1

my_drift_anomalies = tfdv.validate_statistics(
    statistics=my_train_stats,
    schema=my_schema2,
    previous_statistics=my_train2_stats,
    serving_statistics=my_test_stats,
)
tfdv.display_anomalies(my_drift_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'risk_level',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.35 (up to six significant digits), above the threshold 0.1. The feature value with maximum difference is: low"


In [17]:
# Show the two training datasets' statistics side by side.
tfdv.visualize_statistics(
    lhs_statistics=my_train_stats,
    lhs_name='Train_Dataset',
    rhs_statistics=my_train2_stats,
    rhs_name='Train2_Dataset',
)

In [18]:
# Show the second training dataset and the test dataset side by side.
tfdv.visualize_statistics(
    lhs_statistics=my_train2_stats,
    lhs_name='Train2_Dataset',
    rhs_statistics=my_test_stats,
    rhs_name='Test_Dataset',
)