In [24]:
import os
import tfx

# Show which TFX release is installed in this kernel.
print(
    "TFX version: {}".format(tfx.__version__)
)

TFX version: 0.14.0


# ExampleGen

## How to use an ExampleGen component

### Importing CSV

In [1]:
from tfx.utils.dsl_utils import csv_input
from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen

In [4]:
# `examples` refers to the folder that holds the CSV example data.
# The folder can be overridden with the TFX_TAXI_DATA_DIR environment variable;
# when it is unset, the original local path below is used unchanged.
examples = csv_input(
    os.environ.get("TFX_TAXI_DATA_DIR", "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/")
)
# CSV-based ExampleGen reading from that folder.
example_gen = CsvExampleGen(input_base=examples)

### Importing TFRecord

In [5]:
from tfx.utils.dsl_utils import tfrecord_input
from tfx.components.example_gen.import_example_gen.component import ImportExampleGen

In [6]:
# Folder that holds the TFRecord files. Override it with the TFX_TFRECORD_DIR
# environment variable; when it is unset, the original local path is used unchanged.
tfrecord_example = tfrecord_input(
    os.environ.get(
        "TFX_TFRECORD_DIR",
        "/Users/jiankaiwang/Google 雲端硬碟/public/document/201908_DL_ObjectDetection/tfrecords/",
    )
)
# Pass the input by keyword, matching the CsvExampleGen calls in this notebook.
tfrecord_example_gen = ImportExampleGen(input_base=tfrecord_example)

## Data Split

### Split the dataset by ratio (output config)

In [7]:
from tfx.proto import example_gen_pb2

Split the dataset into `train` and `eval` subsets at a 3:1 ratio. The ratio comes from each split's `hash_buckets` value (3 for `train`, 1 for `eval`).

In [16]:
# Output split configuration: one Split per (name, hash_buckets) pair,
# giving 'train' 3 buckets and 'eval' 1 bucket.
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name=split_name, hash_buckets=bucket_count)
            for split_name, bucket_count in (('train', 3), ('eval', 1))
        ]
    )
)

In [17]:
# CSV ExampleGen over the same `examples` input, with the custom output split applied.
example_gen_split = CsvExampleGen(
    input_base=examples,
    output_config=output,
)

### Load an already-split dataset (input config)

Note the `*` wildcard in each split's `pattern`.

For file-based ExampleGen components (e.g. CsvExampleGen, ImportExampleGen), the pattern is a file pattern relative to `input_base`. For query-based ExampleGen components (e.g. BigQueryExampleGen, PrestoExampleGen), the pattern is the SQL query.

By default (when no input or output config is given), the whole input is treated as a single input split, and the output is split into train and eval at a 2:1 ratio.

In [18]:
# Input split configuration: one Split per (name, pattern) pair.
# Each pattern selects the files under the sub-folder of the same name.
inputs = example_gen_pb2.Input(
    splits=[
        example_gen_pb2.Input.Split(name=split_name, pattern=file_pattern)
        for split_name, file_pattern in (("train", "train/*"), ("eval", "eval/*"))
    ]
)

In [19]:
# CSV ExampleGen whose splits are taken from `inputs` (the train/* and eval/* patterns).
example_load_split = CsvExampleGen(
    input_base=examples,
    input_config=inputs,
)

## Customized ExampleGen

A customized ExampleGen provides its own executor, which extends `BaseExampleGenExecutor`. The examples below show a file-based one built on `FileBasedExampleGen` and a query-based one, `PrestoExampleGen`.

### Customized File-based ExampleGen

In [21]:
from tfx.components.base import executor_spec
from tfx.components.example_gen.component import FileBasedExampleGen
from tfx.components.example_gen.csv_example_gen import executor
from tfx.utils.dsl_utils import external_input

# Folder of example data. Override it with the TFX_TAXI_DATA_DIR environment
# variable; when it is unset, the original local path is used unchanged.
# NOTE(review): this rebinds `examples` and `example_gen` from the CSV cells above.
examples = external_input(
    os.environ.get("TFX_TAXI_DATA_DIR", "/Users/jiankaiwang/devops/tfx_taxi/taxi/data/simple/")
)
# The executor class is supplied explicitly through custom_executor_spec;
# here it is the Executor from the csv_example_gen package imported above.
example_gen = FileBasedExampleGen(
    input_base=examples,
    custom_executor_spec=executor_spec.ExecutorClassSpec(executor.Executor))

### Customized Query-based ExampleGen

In [None]:
from tfx.examples.custom_components.presto_example_gen.proto import presto_config_pb2
from tfx.examples.custom_components.presto_example_gen.presto_component.component import PrestoExampleGen

# Connection settings for Presto: host 'localhost', port 8080.
presto_config = presto_config_pb2.PrestoConnConfig(
    host='localhost',
    port=8080,
)
# Query-based ExampleGen: the SQL query defines which rows are read.
# NOTE(review): this rebinds `example_gen` from earlier cells.
example_gen = PrestoExampleGen(
    presto_config,
    query='SELECT * FROM chicago_taxi_trips',
)