In [None]:
import sys
import os

# Resolve the project root: the directory three levels above the notebook's
# working directory. The path is re-joined with os.sep (the same separator it
# was split on) so it stays well-formed on every platform.
root_dir = os.sep.join(os.getcwd().split(os.sep)[:-3])

# Make the project's packages importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed repeatedly.
if root_dir not in sys.path:
    sys.path.append(root_dir)
from utils.helper_metastore import *
from utils.configurations.config import Config

In [None]:
import warnings
warnings.filterwarnings('ignore', 'absl')

%load_ext tensorboard

## Advanced TensorFlow Transform

TensorFlow Transform is a library for preprocessing input data for TensorFlow, including creating features that require a full pass over the training dataset. TensorFlow has built-in support for manipulations on a single example or a batch of examples. tf.Transform extends these capabilities to support full passes over the entire training dataset.

The output of tf.Transform is exported as a TensorFlow graph which we can use for both training and serving. Using the same graph for both training and serving can prevent skew, since the same transformations are applied in both stages.

In this notebook, we pick a scenario where we read the data from disk and define a schema for the given dataset, so that we can transform it using Apache Beam and TensorFlow Transform.

The following figure shows the components of our transformation pipeline:

![fingure_3](../../image/data_preprocessing_fig_3.png)

In [None]:
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
import apache_beam as beam
from tensorflow_transform.tf_metadata.dataset_metadata import DatasetMetadata
from tensorflow_transform.tf_metadata.schema_utils import schema_from_feature_spec

from tfx_bsl.public import tfxio

Here we are going to use the Titanic dataset.

In [None]:
import pandas as pd

# File extension of the raw dataset on disk.
extention = '.csv'

# Full path of the raw CSV file that is loaded below.
train_datafile = os.path.join(
    root_dir,
    Config.ADD_ONS_DATASET_PATH,
    Config.ADD_ONS_DATASET_NAME + extention,
)

# Path used later as the target of the TFRecord writer; it sits next to the
# CSV and shares its base name.
OUPUT_PATH = os.path.join(
    root_dir,
    Config.ADD_ONS_DATASET_PATH,
    Config.ADD_ONS_DATASET_NAME + '.tfrecord',
)

# The file is pipe-delimited, hence the explicit separator.
raw_data = pd.read_csv(train_datafile, sep='|')
raw_data.info()

In [None]:
# All column names of the raw frame, followed by groups of columns that share
# the same dtype in the schema.
columns = list(raw_data.columns)
INT_COLUMNS = [
    'PassengerId',
    'Survived',
    'Pclass',
    'SibSp',
    'Parch',
]
FLOAT_COLUMNS = [
    'Age',
    'Fare',
]
STRING_COLUMNS = [
    'Name',
    'Sex',
    'Ticket',
    'Embarked',
]

# Which transformation applies to which column. preprocessing_fn defines its
# own local copies of these four names with the same values.
ONE_HOT_FEATURES = {
    'Sex': 2,
    'Embarked': 4,
}
TEXT_FEATURES = {
    'Name': None,
}
FLOAT_FEATURE = {
    'Age': True,
    'Fare': True,
}
LABEL_KEY = 'Survived'

Let's define a schema for the columns in our input. Among other things, this will help with importing them correctly.

In [None]:
# Dataset metadata built from a feature spec in which every column is a
# scalar (shape []) FixedLenFeature, typed according to its column group.
meta_data = DatasetMetadata(
    schema_from_feature_spec({
        **{col: tf.io.FixedLenFeature([], tf.string) for col in STRING_COLUMNS},
        **{col: tf.io.FixedLenFeature([], tf.int64) for col in INT_COLUMNS},
        **{col: tf.io.FixedLenFeature([], tf.float32) for col in FLOAT_COLUMNS},
        # The label is declared last so that it is always present as int64,
        # whether or not it already appears in one of the groups above.
        LABEL_KEY: tf.io.FixedLenFeature([], tf.int64),
    })
)

# Schema proto extracted from the metadata.
SCHEMA = meta_data.schema

Here we define some helper functions and the preprocessing function that is used to transform our raw data.

In [None]:
def transformed_name(key):
    """Return the name of the transformed feature: ``key`` with ``'_xf'`` appended."""
    suffix = '_xf'
    return key + suffix

def fill_in_missing(x):
    """Return ``x`` as a dense tensor without its second axis.

    Some features are sparse, but TFT expects the transformation outputs to
    be dense. A sparse input is densified to shape ``[dense_shape[0], 1]``,
    with missing entries filled by ``''`` for string tensors and ``0`` for
    every other dtype. Any tensor with more than one axis is then squeezed
    along axis 1.

    Args:
        x: A ``tf.Tensor`` or ``tf.SparseTensor``.

    Returns:
        The dense (and, if it had more than one axis, squeezed) tensor.
    """
    # isinstance() also recognises subclasses, unlike an exact type comparison.
    if isinstance(x, tf.SparseTensor):
        # The fill value is only needed when densifying.
        default_value = '' if x.dtype == tf.string else 0
        x = tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value)
    if len(x.shape) > 1:
        x = tf.squeeze(x, axis=1)
    return x

def convert_num_to_one_hot(label_tensor, num_labels=2):
    """One-hot encode ``label_tensor`` and reshape the result to ``[-1, num_labels]``.

    Args:
        label_tensor: Tensor of indices passed to ``tf.one_hot``.
        num_labels: Depth of the one-hot encoding (defaults to 2).

    Returns:
        The one-hot tensor reshaped to ``[-1, num_labels]``.
    """
    encoded = tf.one_hot(label_tensor, depth=num_labels)
    target_shape = [-1, num_labels]
    return tf.reshape(encoded, target_shape)


def preprocessing_fn(inputs):
    """Build the transformed features from the raw input tensors.

    Args:
        inputs: Mapping from raw column name to its tensor; must contain the
            keys 'Sex', 'Embarked', 'Age', 'Fare', 'Name' and 'Survived'.

    Returns:
        A dict keyed by ``transformed_name(<column>)`` holding:
        one-hot encoded categorical features, z-score scaled float features,
        and the densified text feature and label passed through unchanged.
    """
    # The feature configuration is defined locally, so this function does not
    # read the module-level constants of the same names.
    ONE_HOT_FEATURES = {'Sex': 2, 'Embarked': 4}
    TEXT_FEATURES = {'Name': None}
    FLOAT_FEATURE = {'Age': True, 'Fare': True}
    LABEL_KEY = 'Survived'

    outputs = {}

    # Categorical columns: string -> vocabulary index -> one-hot vector.
    # Both the vocabulary size limit and the one-hot depth are dim + 1.
    for feature, dim in ONE_HOT_FEATURES.items():
        depth = dim + 1
        vocab_index = tft.compute_and_apply_vocabulary(
            fill_in_missing(inputs[feature]), top_k=depth)
        outputs[transformed_name(feature)] = convert_num_to_one_hot(
            vocab_index, num_labels=depth)

    # Float columns: densify, then z-score scale when the flag is set.
    for feature, to_norm in FLOAT_FEATURE.items():
        dense_value = fill_in_missing(inputs[feature])
        outputs[transformed_name(feature)] = (
            tft.scale_to_z_score(dense_value) if to_norm else dense_value)

    # Text columns and the label are only densified.
    for feature in TEXT_FEATURES:
        outputs[transformed_name(feature)] = fill_in_missing(inputs[feature])
    outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY])

    return outputs

In [None]:
# Pipeline working directory; it is passed to the TFT Beam context as temp_dir.
base_dir = os.path.join(root_dir, Config.PIPELINE_FOLDER)

# NOTE(review): data_file is not referenced anywhere else in the visible
# notebook and names a different dataset than the one loaded above -- confirm
# whether it is still needed.
data_file = os.path.join(
    root_dir,
    'data',
    'source_data',
    'consumer_complaints_with_narrative.csv',
)

Now we're ready to start transforming our data in an Apache Beam pipeline.

- Read in the data using the CSV reader
- Transform it using a preprocessing pipeline that scales numeric data and converts categorical data from strings to int64 index values, by creating a vocabulary for each category
- Write out the result as a TFRecord of Example protos, which we will use for training a model later

What does the code do?

1) **First of all, we read the data from the CSV file using a CSV reader** <br>
     
Create a TFXIO to read the Titanic data with the schema. To do this we need to list all columns in order, since the schema doesn't specify the order of columns in the CSV. ```tfxio.CsvTFXIO``` can be used to both read the CSV files and parse them to TFT inputs.<br><br>
 
What if you are not able to read the data directly with ```tfxio.CsvTFXIO``` because of some extra spaces? <br>
&emsp;&emsp;The idea is to read the CSV files as text using ```beam.io```, do some cleansing to remove the extra spaces, and then parse them for TFT using ```BeamRecordCsvTFXIO```, whose ```.BeamSource()``` accepts a PCollection[bytes], because we need to patch the records first.

2) **Combine data and schema into a dataset tuple**<br>
&emsp;&emsp;```Note: we already used the schema to read the CSV data, but we also need it to interpret raw_data.```

3) **Apply transformation using ```tft_beam.AnalyzeAndTransformDataset```**

In [None]:
# Run the full transform: read the CSV, analyze + transform it with
# preprocessing_fn, and write the result as TFRecords to OUPUT_PATH.
with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=base_dir):
        # Read and parse the '|'-delimited CSV. Column names are supplied
        # explicitly, and the single header line is skipped.
        csv_tfxio = tfxio.CsvTFXIO(
            train_datafile,
            column_names=columns,
            telemetry_descriptors=['Demo'],
            schema=SCHEMA,
            delimiter='|',
            skip_header_lines=1,
            skip_blank_lines=False)

        # Use a dedicated name for the PCollection: rebinding the global
        # `raw_data` here would replace the pandas DataFrame and break any
        # re-run of the earlier cell that reads `raw_data.columns`.
        raw_records = p | 'ReadTextFile' >> csv_tfxio.BeamSource()

        # Combine the records and their tensor adapter config into the
        # (data, metadata) tuple expected by the TFT Beam transforms.
        raw_dataset = (raw_records, csv_tfxio.TensorAdapterConfig())

        # Apply the transformation; this yields the transformed dataset and
        # the transform function.
        transformed_dataset, transform_fn = (
            raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_data, transformed_metadata = transformed_dataset

        # Coder that encodes the transformed data before it is written out
        # as TFRecords.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (
            transformed_data
            | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
            | 'WriteTrainData' >> beam.io.WriteToTFRecord(OUPUT_PATH))

In [None]:
import os

# Folder scanned for transform artifacts -- presumably where the TFT Beam
# context (temp_dir = <root>/<PIPELINE_FOLDER>) stores its output; confirm.
artifact_path = os.path.join(root_dir, Config.PIPELINE_FOLDER, 'tftransform_tmp')

# Every sub-directory located directly under artifact_path.
all_subdirs = [
    candidate
    for candidate in (
        os.path.join(artifact_path, entry) for entry in os.listdir(artifact_path)
    )
    if os.path.isdir(candidate)
]

# Fail with an actionable message instead of max()'s bare
# "arg is an empty sequence" when no artifact directory exists yet.
if not all_subdirs:
    raise ValueError(
        f'No sub-directories found in {artifact_path!r}; '
        'run the transform pipeline cell first.')

# The most recently modified sub-directory is treated as the latest run.
latest_subdir = max(all_subdirs, key=os.path.getmtime)

# Directory handed to TensorBoard via --logdir in the final cell.
log_dir = os.path.join(root_dir, Config.TENSORBOARD_LOGGING)

Explore the transform graph using TensorBoard

In [None]:
# Project helper from utils.tools; its implementation is not part of this notebook.
from utils.tools import load_graph_to_tensorboard

# Presumably exports the graph found in `latest_subdir` into `log_dir` so that
# TensorBoard can render it -- TODO confirm against the helper's implementation.
load_graph_to_tensorboard(latest_subdir, log_dir)

In [None]:
%tensorboard --logdir {log_dir}