In [None]:
%%bash

# Stop at the first failing command so convert_to_tfrecords.py is never run
# after download_dataset.py (or a mkdir) has failed.
set -euo pipefail

mkdir -p ../data/dataset1
mkdir -p ../data/dataset2

python3 ../utils/download_dataset.py
python3 ../utils/convert_to_tfrecords.py

# Table of Contents

>- [Data Ingestion](#Data-Ingestion)
>- [What is an artifact?](#What-is-artifact?)
>- [What is the metadata store for?](#what-metadata-store-is-for?)
>>- [Artifacts Tables](#atrifacts-Tables)
>>- [Contexts Tables](#Contexts-Tables)
>>- [Executions Tables](#Executions-Tables)
>- [Loading dataset from tf_records](#Loding-dataset-from-tf_records)
>- [Configuration Options](#Configuration-Options)
>>- [Splitting](#splitting)
>>- [If data is stored in a split manner](#If-data-is-stored-in-spitted-manner)
>>- [Span](#Span)
>- [Add-ons](#Add-ons)

## Data Ingestion

In [47]:
import warnings
# NOTE(review): the second positional argument of filterwarnings() is `message`
# (a regex matched against the start of the warning text), not `module`.
# Presumably the intent was to silence warnings originating from absl — confirm.
warnings.filterwarnings('ignore', 'absl')

In [None]:
import json
import os
import pprint
import shutil
from collections import defaultdict

import pandas as pd
import tensorflow as tf

from tfx.components import CsvExampleGen
from tfx.utils.dsl_utils import external_input
from tfx.orchestration.experimental.interactive.interactive_context \
        import InteractiveContext

# Shared pretty-printer; used further down to print a parsed tf.train.Example.
pp = pprint.PrettyPrinter()

In [None]:
from ml_metadata.metadata_store import metadata_store
from ml_metadata.proto import metadata_store_pb2

In [None]:
import datetime

# The current timestamp makes the pipeline name differ on every execution of this cell.
id_ = str(datetime.datetime.now())
pipeline_name = f'pipline_{id_}'

# All pipeline outputs go to "<parent of the working directory>/temp_".
base_root = os.path.split(os.getcwd())[0]
pipeline_root = os.path.join(base_root, 'temp_')
beam_args = [
    '--runner=DirectRunner'
]

# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and has no check-then-create race.
os.makedirs(pipeline_root, exist_ok=True)

context = InteractiveContext(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             beam_pipeline_args=beam_args)

In [None]:
# dataset1 lives under "<parent of the working directory>/data".
root_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(root_dir, 'data', 'dataset1')

# One directory entry per line.
print('\n'.join(os.listdir(data_dir)))

In [None]:
# Wrap the dataset1 directory as a component input and build a CsvExampleGen from it.
examples = external_input(data_dir)
example_gen = CsvExampleGen(input = examples)

In [None]:
# Execute the component through the interactive context created above.
context.run(example_gen)

In [None]:
# First (index 0) artifact on the component's 'examples' output channel.
example_gen_prop = example_gen.outputs['examples'].get()[0]

print('Artifact Location: ')
print(f'\t {example_gen_prop.uri}')
print()

# List the files found in each split sub-directory of the artifact.
print('Files: ')
for split in ('train', 'eval'):
    print(f'\t {split}')
    print(f'\t\t {os.listdir(os.path.join(example_gen_prop.uri, split))}')

### What is artifact?

In [None]:
# split_names is a string property of the artifact. Parse it as data with
# json.loads instead of executing it with eval().
# NOTE(review): TFX encodes split names as a JSON list — confirm for the
# installed TFX version.
split_names = json.loads(example_gen_prop.split_names)

# Directory of the first split and the record files it contains.
artifact = os.path.join(example_gen_prop.uri, split_names[0])
files = [os.path.join(artifact, file_name) for file_name in os.listdir(artifact)]

train = tf.data.TFRecordDataset(filenames=files, compression_type='GZIP')

In [None]:
# Decode the first serialized record into a tf.train.Example and pretty-print it.
for data in train.take(1):
    serialized_example = data.numpy()
    example = tf.train.Example.FromString(serialized_example)
    pp.pprint(example)

### what metadata store is for?

In [None]:
# Open the metadata store that the interactive context writes to.
connection_config = context.metadata_connection_config
store = metadata_store.MetadataStore(connection_config)

# Everything in the sqlite URI that precedes the 'metadata.sqlite' file name;
# display_artifacts() strips this prefix from artifact URIs.
sqlite_uri = connection_config.sqlite.filename_uri
base_dir = sqlite_uri.partition('metadata.sqlite')[0]

In [None]:
def display_properties(input):
    """Flatten the properties of metadata records into one DataFrame.

    Args:
        input: iterable of records exposing ``id``, ``type_id``, ``properties``
            and ``custom_properties`` (the latter two are mappings whose
            values expose ``string_value``). The notebook passes artifacts,
            contexts and executions; the id column is labelled 'artifact id'
            for all of them.

    Returns:
        A DataFrame with one row per property and the columns
        'artifact id', 'type_id', 'name', 'is_customproperty' (0 for
        ``properties``, 1 for ``custom_properties``) and 'value'. For each
        record, regular properties come before custom ones. The columns are
        present even when ``input`` is empty, so column filters on the result
        do not raise KeyError.

    NOTE(review): only ``string_value`` is read, so non-string property values
    presumably show up as empty strings — confirm.
    """
    columns = ('artifact id', 'type_id', 'name', 'is_customproperty', 'value')
    data = {column: [] for column in columns}

    for artifact in input:
        # Same row layout for both kinds; only the flag differs.
        property_groups = (
            (0, artifact.properties),
            (1, artifact.custom_properties),
        )
        for is_custom, group in property_groups:
            for key, value in group.items():
                data['artifact id'].append(artifact.id)
                data['type_id'].append(artifact.type_id)
                data['name'].append(key)
                data['is_customproperty'].append(is_custom)
                data['value'].append(value.string_value)
    return pd.DataFrame(data)


def display_types(types):
    """Tabulate metadata types as a DataFrame with 'id' and 'name' columns.

    Only the part of each type name after the last '.' is kept.
    """
    ids = []
    short_names = []
    for a_type in types:
        ids.append(a_type.id)
        short_names.append(a_type.name.rsplit('.', 1)[-1])
    return pd.DataFrame(data={'id': ids, 'name': short_names})

def display_artifacts(store, artifacts):
    """Tabulate artifacts as a DataFrame.

    Args:
        store: metadata store; ``get_artifact_types_by_id`` is called once per
            artifact to resolve its type name.
        artifacts: iterable of artifacts exposing ``id``, ``type_id``, ``uri``,
            ``create_time_since_epoch`` and ``last_update_time_since_epoch``.

    Returns:
        A DataFrame with the columns 'artifact id', 'type', 'uri',
        'create_time_since_epoch' and 'last_update_time_since_epoch'. The
        columns are present even when ``artifacts`` is empty.

    Reads the module-level ``base_dir`` and replaces it with './' in every URI.
    """
    table = {
        'artifact id': [],
        'type': [],
        'uri': [],
        'create_time_since_epoch': [],
        'last_update_time_since_epoch': [],
    }
    for a in artifacts:
        artifact_type = store.get_artifact_types_by_id([a.type_id])[0]
        table['artifact id'].append(a.id)
        table['type'].append(artifact_type.name)
        table['uri'].append(a.uri.replace(base_dir, './'))
        table['create_time_since_epoch'].append(a.create_time_since_epoch)
        table['last_update_time_since_epoch'].append(a.last_update_time_since_epoch)
    return pd.DataFrame(data=table)

In [None]:
def display_context(store, artifacts):
    """Tabulate contexts as a DataFrame.

    Args:
        store: metadata store; ``get_context_types_by_id`` is called once per
            context to resolve its type name.
        artifacts: iterable of contexts exposing ``id``, ``type_id``, ``name``,
            ``create_time_since_epoch`` and ``last_update_time_since_epoch``.
            The parameter name is kept for compatibility.

    Returns:
        A DataFrame with the columns 'artifact id' (the context id), 'type',
        'name', 'create_time_since_epoch' and 'last_update_time_since_epoch'.
        The columns are present even when ``artifacts`` is empty.
    """
    table = {
        'artifact id': [],
        'type': [],
        'name': [],
        'create_time_since_epoch': [],
        'last_update_time_since_epoch': [],
    }
    for a in artifacts:
        context_type = store.get_context_types_by_id([a.type_id])[0]
        table['artifact id'].append(a.id)
        table['type'].append(context_type.name)
        table['name'].append(a.name)
        table['create_time_since_epoch'].append(a.create_time_since_epoch)
        table['last_update_time_since_epoch'].append(a.last_update_time_since_epoch)
    return pd.DataFrame(data=table)

def display_executions(store, artifacts):
    """Tabulate executions as a DataFrame.

    Args:
        store: metadata store; ``get_execution_types_by_id`` is called once per
            execution to resolve its type name.
        artifacts: iterable of executions exposing ``id``, ``type_id``,
            ``last_known_state``, ``create_time_since_epoch`` and
            ``last_update_time_since_epoch``. The parameter name is kept for
            compatibility.

    Returns:
        A DataFrame with the columns 'artifact id' (the execution id), 'type'
        (the part of the type name after the last '.'), 'last_known_state',
        'create_time_since_epoch' and 'last_update_time_since_epoch'. The
        columns are present even when ``artifacts`` is empty.
    """
    # States 2 and 3 get a readable label; any other state is shown as-is.
    # NOTE(review): presumably these are the ML Metadata Execution.State values
    # RUNNING and COMPLETE — confirm.
    state_labels = {2: 'Running', 3: 'Success'}

    table = {
        'artifact id': [],
        'type': [],
        'last_known_state': [],
        'create_time_since_epoch': [],
        'last_update_time_since_epoch': [],
    }
    for a in artifacts:
        execution_type = store.get_execution_types_by_id([a.type_id])[0]
        e_state = a.last_known_state
        table['artifact id'].append(a.id)
        table['type'].append(execution_type.name.split('.')[-1])
        table['last_known_state'].append(state_labels.get(e_state, e_state))
        table['create_time_since_epoch'].append(a.create_time_since_epoch)
        table['last_update_time_since_epoch'].append(a.last_update_time_since_epoch)
    return pd.DataFrame(data=table)

#### atrifacts Tables

In [None]:
# Table of everything store.get_artifacts() returns.
display_artifacts(store, store.get_artifacts())

In [None]:
# Ids and short names of the artifact types known to the store.
display_types(store.get_artifact_types())

In [None]:
# One row per (custom) property of every artifact.
display_properties(store.get_artifacts())

#### Contexts Tables

In [None]:
# Table of everything store.get_contexts() returns.
display_context(store, store.get_contexts())

In [None]:
# Ids and short names of the context types known to the store.
display_types(store.get_context_types())

In [None]:
# One row per (custom) property of every context; display_properties labels
# the id column 'artifact id' even though these rows are contexts.
display_properties(store.get_contexts())

#### Executions Tables

In [None]:
# Table of everything store.get_executions() returns.
display_executions(store, store.get_executions())

In [None]:
# One row per (custom) property of every execution; display_properties labels
# the id column 'artifact id' even though these rows are executions.
display_properties(store.get_executions())

In [None]:
# Ids and short names of the execution types known to the store.
display_types(store.get_execution_types())

## Loding dataset from tf_records

In [None]:
from tfx.components import ImportExampleGen

# dataset2 lives under "<parent of the working directory>/data".
root_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(root_dir, 'data', 'dataset2')

# One directory entry per line.
print('\n'.join(os.listdir(data_dir)))

In [None]:
# Same flow as for the CSV data, but with ImportExampleGen reading the dataset2 directory.
examples = external_input(data_dir)
example_gen = ImportExampleGen(input=examples)
context.run(example_gen)

In [None]:
# Re-list the executions now that ImportExampleGen has been run.
display_executions(store, store.get_executions())

In [None]:
# Re-list the artifact properties now that ImportExampleGen has been run.
display_properties(store.get_artifacts())

## Configuration Options

### splitting

In [None]:
from tfx.proto import example_gen_pb2

Configuring the output as train, eval and test splits with a 6:2:2 ratio.

In [None]:
# 6:2:2 train/eval/test split, expressed as hash-bucket counts. Built once and
# reused for every attempt below.
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=6),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=2),
    ]))

# "data/dataset" is tried first and "data/dataset1" is the fallback, exactly as
# before.
# NOTE(review): the first directory is presumably expected to fail (the setup
# cell only creates dataset1 and dataset2), and the cells further down look for
# an execution left in the 'Running' state — confirm this is the intended demo.
candidate_dirs = [
    os.path.join(os.pardir, "data/dataset"),
    os.path.join(os.pardir, "data/dataset1"),
]

for attempt, data_dir in enumerate(candidate_dirs, start=1):
    try:
        examples = external_input(data_dir)
        example_gen = CsvExampleGen(input=examples, output_config=output)
        context.run(example_gen)
        break
    # `except Exception` (not a bare `except:`) so that interrupting the kernel
    # stops the cell instead of silently starting the fallback run.
    except Exception as err:
        if attempt == len(candidate_dirs):
            # The fallback failed too: surface the error, as the original did.
            raise
        print(f'CsvExampleGen failed for {data_dir!r}: {err!r}; trying the next directory.')

In [None]:
%%bash

# Show the directory tree under ../temp_, the pipeline root used by this notebook.
tree ../temp_

In [None]:
def highlight(s):
    """Row styler: red background for 'Running' executions, white otherwise.

    Args:
        s: one row of the executions table (a Series with a
            ``last_known_state`` entry).

    Returns:
        A list with one CSS declaration per cell of the row. The length
        follows the row instead of being hard-coded to five columns.
    """
    colour = 'red' if s.last_known_state == 'Running' else 'white'
    return [f'background-color: {colour}'] * len(s)

# Executions table with each row coloured by highlight() (axis=1 styles row by row).
execution = display_executions(store, store.get_executions())
execution.style.apply(highlight, axis = 1)

In [None]:
# Id of the first execution listed as 'Running'; it is compared with the
# execution whose id is one higher.
artifact_id = execution['artifact id'].loc[execution.last_known_state == 'Running'].values[0]
compared_ids = [artifact_id, artifact_id + 1]

def highlight(s):
    """Row styler: light blue for ``artifact_id``, light green for the next id.

    Any other row is white. Returns one CSS declaration per cell of the row;
    the length follows the row instead of being hard-coded to five columns.
    """
    if s['artifact id'] == artifact_id:
        colour = 'lightblue'
    elif s['artifact id'] == artifact_id + 1:
        colour = 'lightgreen'
    else:
        colour = 'white'
    return [f'background-color: {colour}'] * len(s)

# Keep only the properties of the two compared executions, ordered so that the
# same property of both executions ends up on adjacent rows.
execution_prop = display_properties(store.get_executions())
execution_prop = execution_prop.loc[execution_prop['artifact id'].isin(compared_ids)].sort_values(by=['name', 'artifact id'])
execution_prop.style.apply(highlight, axis=1)

### If data is stored in spitted manner

In [None]:
# First (index 0) artifact on the 'examples' output of the most recent example_gen.
example_gen_prop = example_gen.outputs['examples'].get()[0]

dataset3_dir = '../data/dataset3'
# copytree() raises if the destination already exists, so remove any copy left
# by a previous run of this cell first; ignore_errors covers the first run.
shutil.rmtree(dataset3_dir, ignore_errors=True)
shutil.copytree(example_gen_prop.uri, dataset3_dir)

In [None]:
from tfx.proto import example_gen_pb2

root_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(root_dir, 'data', 'dataset3')

# One input split per sub-directory of dataset3. Named input_config rather than
# `input` so the builtin input() is not shadowed for the rest of the notebook.
input_config = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(name='train', pattern='train/*'),
    example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    example_gen_pb2.Input.Split(name='test', pattern='test/*')
])

# data_dir is built from os.getcwd() and is therefore already absolute, so the
# former os.path.join(base_dir, data_dir) discarded base_dir and returned
# data_dir unchanged.
examples = external_input(data_dir)
example_gen = ImportExampleGen(input=examples, input_config=input_config)
context.run(example_gen)

In [None]:
# Properties of the execution with the highest id.
execution_property = display_properties(store.get_executions())
latest_id = max(execution_property['artifact id'])
execution_property[execution_property['artifact id'] == latest_id]

### Span

In [None]:
%%bash

# Stop at the first failing command instead of continuing with partial output.
set -euo pipefail

src=../data/dataset1/consumer_complaints_with_narrative.csv
dst=../data/dataset4

mkdir -p "$dst/export-0" "$dst/export-1" "$dst/export-2"

# $(( ... )) normalises the wc output to a bare integer (some wc builds pad it
# with spaces), which keeps the quoted file names below clean.
file_l_count=$(( $(wc -l < "$src") ))

# export-0: first third of the lines, export-1: first half, export-2: whole file.
# '>' rather than '>>' so re-running the cell rewrites the files instead of
# appending a second copy of the rows.
head -n $(( file_l_count/3 )) "$src" > "$dst/export-0/consumer_complaints_with_narrative_$(( file_l_count/3 )).csv"
head -n $(( file_l_count/2 )) "$src" > "$dst/export-1/consumer_complaints_with_narrative_$(( file_l_count/2 )).csv"
cp "$src" "$dst/export-2/consumer_complaints_with_narrative_${file_l_count}.csv"

tree "$dst"

In [None]:
# Use root_dir for the project root. base_dir holds the metadata-store prefix
# that display_artifacts() strips from URIs, so it must not be reassigned here.
root_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(root_dir, "data", "dataset4")


# A single split whose pattern contains the {SPAN} placeholder, matching the
# export-0 / export-1 / export-2 directories created above. Named input_config
# rather than `input` so the builtin input() is not shadowed.
input_config = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(pattern='export-{SPAN}/*')
])
examples = external_input(data_dir)
example_gen = CsvExampleGen(input=examples, input_config=input_config)
context.run(example_gen)

In [None]:
# Keep only the 'input_base' and 'span' properties of every execution,
# renumber the rows, then order them by execution id and property name.
execution_prperties = display_properties(store.get_executions())
temp_val = (
    execution_prperties[execution_prperties['name'].isin(['input_base', 'span'])]
    .reset_index(drop=True)
    .sort_values(['artifact id', 'name'])
)

In [None]:
# Highlight the largest entry of the 'value' column in light green.
# NOTE(review): display_properties fills 'value' from string_value, so this is
# presumably a string comparison rather than a numeric one — confirm.
temp_val.style.highlight_max(subset = ['value'],
                       color = 'lightgreen', axis = 0)


## Add-ons

### Ingesting data from Avro or Parquet file formats

#### from Avro-serialized data

```
from tfx.components import FileBasedExampleGen
from tfx.components.example_gen.custom_executors import avro_executor
from tfx.utils.dsl_utils import external_input
examples = external_input(avro_dir_path)

example_gen = FileBasedExampleGen(
    input=examples,
    executor_class=avro_executor.Executor)
```

####  from Parquet-serialized data

```
from tfx.components.example_gen.custom_executors import parquet_executor
example_gen = FileBasedExampleGen(
input=examples,
executor_class=parquet_executor.Executor)
```

### Ingesting data from a database

#### from bigquery database
```
from tfx.components import BigQueryExampleGen
query = """
SELECT * FROM `<project_id>.<database>.<table_name>`
"""
example_gen = BigQueryExampleGen(query=query)
```



>Note: In TFX versions greater than 0.22.0, the `BigQueryExampleGen` component needs to be imported from `tfx.extensions.google_cloud_big_query`:
>```
>from tfx.extensions.google_cloud_big_query.example_gen import component as big_query_example_gen_component
>big_query_example_gen_component.BigQueryExampleGen(query=query)
>```

#### from presto database
```
from proto import presto_config_pb2
from presto_component.component import PrestoExampleGen

query = """
SELECT * FROM `<project_id>.<database>.<table_name>`
"""
presto_config = presto_config_pb2.PrestoConnConfig(
host='localhost',
port=8080)
example_gen = PrestoExampleGen(presto_config, query=query)
```