# Developing dataset schema

In [1]:
import os
import sys
import logging

import tensorflow as tf
import tensorflow_data_validation as tfdv

from tensorflow_metadata.proto.v0 import schema_pb2, statistics_pb2, anomalies_pb2
from google.cloud import bigquery

In [2]:
# Google Cloud project, region, and the prefix used to derive resource names.
PROJECT = 'jk-vertex-workshop'
REGION = 'us-central1'
PREFIX = 'jkvw'

# Cloud Storage staging bucket, derived from the prefix.
STAGING_BUCKET = 'gs://{}-bucket'.format(PREFIX)

# BigQuery dataset (derived from the prefix), its location, and split names.
BQ_DATASET_NAME = '{}_dataset'.format(PREFIX)
BQ_LOCATION = 'US'
BQ_TRAIN_SPLIT_NAME = 'training'
BQ_VALID_SPLIT_NAME = 'validation'
BQ_TEST_SPLIT_NAME = 'testing'

## Generate Raw Data Schema

### Load the training split

In [3]:
# Bind the client to the configured project and dataset location instead of
# relying on whatever defaults the ambient credentials happen to provide.
client = bigquery.Client(project=PROJECT, location=BQ_LOCATION)

# Read every row of the training split. The table path is backtick-quoted
# because the project id contains hyphens.
sql_script = f'''
SELECT *
FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TRAIN_SPLIT_NAME}`
'''

# Run the query, wait for completion, and materialize the rows as a DataFrame.
df = client.query(sql_script).result().to_dataframe()

In [4]:
# Preview the first rows, transposed so each feature is shown on its own row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
trip_month,2,2,2,2,2
trip_day,1,1,1,1,1
trip_day_of_week,7,7,7,7,7
trip_hour,13,16,21,15,15
trip_seconds,89,480,1500,518,124
trip_miles,0.07,1.2,1.4,0.8,0.39
payment_type,Cash,Cash,Cash,Cash,Cash
pickup_grid,POINT(-87.6 41.9),POINT(-87.7 41.9),POINT(-87.7 41.8),POINT(-87.7 42),POINT(-87.6 41.9)
dropoff_grid,POINT(-87.6 41.9),POINT(-87.7 41.9),POINT(-87.7 41.8),POINT(-87.7 42),POINT(-87.6 41.9)
euclidean,0.0,0.0,0.0,0.0,0.0


### Generate statistics

In [5]:
# Statistics options: `tip_bin` is the label, no weight feature, a sample
# rate of 1, and num_top_values set to 50.
stats_options = tfdv.StatsOptions(
    label_feature='tip_bin',
    weight_feature=None,
    sample_rate=1,
    num_top_values=50,
)

# Compute dataset statistics directly from the in-memory DataFrame.
stats = tfdv.generate_statistics_from_dataframe(df, stats_options=stats_options)

In [6]:
# Render the computed statistics for visual inspection.
tfdv.visualize_statistics(stats)

### Generate schema

In [7]:
# Infer an initial schema from the statistics computed above, then show it.
schema = tfdv.infer_schema(stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'trip_month',INT,required,,-
'trip_day',INT,required,,-
'trip_day_of_week',INT,required,,-
'trip_hour',INT,required,,-
'trip_seconds',INT,required,,-
'trip_miles',FLOAT,required,,-
'payment_type',STRING,required,,'payment_type'
'pickup_grid',STRING,required,,'pickup_grid'
'dropoff_grid',STRING,required,,'dropoff_grid'
'euclidean',FLOAT,required,,-


### Update the schema

In [8]:
# Integer features treated as categorical, with their inclusive (min, max) bounds.
categorical_int_ranges = (
    ('trip_month', 1, 12),
    ('trip_day', 1, 31),
    ('trip_day_of_week', 1, 7),
    ('trip_hour', 0, 23),
    ('tip_bin', 0, 1),
)

# Attach a categorical IntDomain to each listed feature in the schema.
for feature_name, lower, upper in categorical_int_ranges:
    int_domain = schema_pb2.IntDomain(
        name=feature_name, min=lower, max=upper, is_categorical=True
    )
    tfdv.set_domain(schema, feature_name, int_domain)



2
4
2
4
2
4
2
4
2
4


In [9]:
# Display the schema again to check the domains set above.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'trip_month',INT,required,,"[1,12]"
'trip_day',INT,required,,"[1,31]"
'trip_day_of_week',INT,required,,"[1,7]"
'trip_hour',INT,required,,"[0,23]"
'trip_seconds',INT,required,,-
'trip_miles',FLOAT,required,,-
'payment_type',STRING,required,,'payment_type'
'pickup_grid',STRING,required,,'pickup_grid'
'dropoff_grid',STRING,required,,'dropoff_grid'
'euclidean',FLOAT,required,,-


In [10]:
# Mark `tip_bin` as the target by tagging its schema annotation.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate 'target' tag.
target_feature = tfdv.get_feature(schema, 'tip_bin')
if 'target' not in target_feature.annotation.tag:
    target_feature.annotation.tag.append('target')

In [11]:
# Show the schema proto itself (text form) rather than the summary table.
schema

feature {
  name: "trip_month"
  type: INT
  int_domain {
    name: "trip_month"
    min: 1
    max: 12
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_day"
  type: INT
  int_domain {
    name: "trip_day"
    min: 1
    max: 31
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_day_of_week"
  type: INT
  int_domain {
    name: "trip_day_of_week"
    min: 1
    max: 7
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_hour"
  type: INT
  int_domain {
    name: "trip_hour"
    min: 0
    max: 23
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_seconds"
  type: INT
  presence

### Save the updated schema

In [14]:
# Destination inside the staging bucket: <STAGING_BUCKET>/schema/schema.pbtxt
schema_dir = os.path.join(STAGING_BUCKET, 'schema')
schema_file = os.path.join(schema_dir, 'schema.pbtxt')
tf.io.gfile.makedirs(schema_dir)

# Write the schema as text to the staging bucket first, then to a
# `schema.pbtxt` file in the current working directory.
for output_path in (schema_file, 'schema.pbtxt'):
    tfdv.write_schema_text(schema, output_path)

## Load and display the schema

In [15]:
# Reload the schema from the text file at `schema_file`, replacing the
# in-memory `schema` object.
schema = tfdv.load_schema_text(schema_file)

In [16]:
# Display the schema currently bound to `schema`.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'trip_month',INT,required,,"[1,12]"
'trip_day',INT,required,,"[1,31]"
'trip_day_of_week',INT,required,,"[1,7]"
'trip_hour',INT,required,,"[0,23]"
'trip_seconds',INT,required,,-
'trip_miles',FLOAT,required,,-
'payment_type',STRING,required,,'payment_type'
'pickup_grid',STRING,required,,'pickup_grid'
'dropoff_grid',STRING,required,,'dropoff_grid'
'euclidean',FLOAT,required,,-
