In [None]:
import sys
import os

root_dir = os.path.split(os.getcwd())[0]

sys.path.append(root_dir)
from utils.helper_metastore import *
from utils.configurations.config import Config

In [None]:
import warnings
warnings.filterwarnings('ignore', 'absl')

%load_ext tensorboard

## General Introduction

The data that we use to train our machine learning model is not always available in the required format. 

>For example, the target variable in the dataset that we explored before is categorical (Yes or No), but machine learning models only accept numeric data. 

We also have to do some feature engineering on the input features to improve the model accuracy.
>For example, feature crossing, etc.

For all of these steps we need a function that applies the same transformations to the input features at both training and serving time.



As a Data Scientist, we mostly prefer python notebook with pandas or in Spark jobs to develop initial process like Data Exploration, creating preprocessing functions and model development. We should re-use the same preprocessing code to guarantee that a given raw input maps to the same feature vector at training and serving time. If this does not happen, we have training-serving skew.

Let's imagine you have been assigned to a new ML project and you decide to use a Spark pipeline to process the raw data. In the production server, the preprocessing steps are
implemented in an API. Due to some limitations, you are forced to re-implement all your preprocessing steps using numpy/pandas. Now you have to maintain two different implementations of the preprocessing (Spark and numpy/pandas). Given an input, you must ensure they produce the same output to avoid training-serving skew.
![Figure_1](image/data_preprocessing_fig_1.png)

With TFT, we can avoid a misalignment of the preprocessing steps.

## Why tensorflow transform?

The three main purposes of TensorFlow Transform (TFT) are:
- Preprocessing your data efficiently in the context of the entire dataset
- Scaling the preprocessing steps effectively
- Avoiding a potential training-serving skew

TFT processes the data that we ingested into our pipeline with the earlier generated
dataset schema, and it outputs two artifacts:
- Preprocessed training and evaluation datasets in the TFRecord format. The produced datasets can be consumed downstream in the Trainer component of our pipeline.
- Exported preprocessing graph (with assets), which will be used when we’ll export our machine learning model.

The key to TFT is the preprocessing_fn function. The function defines all transformations we want to apply to the raw data. When we execute the Transform component, the preprocessing_fn function will receive the raw data,
apply the transformation, and return the processed data. The data is provided as TensorFlow Tensors or SparseTensors (depending on the feature). All transformations applied to the tensors have to be TensorFlow operations. This allows TFT to effectively distribute the preprocessing steps.

TFT uses Apache Beam under the hood to execute preprocessing instructions. This allows us to distribute the preprocessing if needed on the Apache Beam backend of our choice. If you don’t have access to Google Cloud’s Dataflow product or an Apache Spark or Apache Flink cluster, Apache Beam will default back to its Direct
Runner mode. 

>Note:
To avoid a misalignment between the preprocessing steps and the trained model, the exported model can include the preprocessing graph and the trained model. We can then deploy the model like any other TensorFlow model, but during our inference, the data will be preprocessed on the model server as part of the model inference. This avoids the requirement that preprocessing happen on the client side and simplifies the development of clients.

## Defining Preprocessing function

TFT accepts a preprocessing function and outputs the TF graph and the transformed dataset. The preprocessing function must accept and return a dictionary of tensors. We can use two types of functions to define the preprocessing
function: 

- Any function that accepts and returns tensors. These add operations to the TF graph that transform raw data into transformed data.
- The analyzer functions (tft.min, tft.mean, etc.) provided by tf.Transform. They also accept and return tensors, but unlike TensorFlow functions they are not added as operations to the graph. Instead, they compute a full-pass operation outside of TensorFlow: they use the input tensor values over the entire dataset to generate a constant tensor that is returned as the output. Convenience mappers such as tft.scale_to_z_score use analyzers internally.

In [None]:
import pprint
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from tfx.components import Transform
from ml_metadata.metadata_store import metadata_store
from ml_metadata.proto import metadata_store_pb2

# Locate the ML Metadata SQLite database inside the pipeline folder and open
# a metadata store connection on it.
base_dir = os.path.join(root_dir, Config.PIPELINE_FOLDER)
file = [i for i in os.listdir(base_dir) if 'sqlite' in i]
if not file:
    # Without this guard the lookup below fails with a bare IndexError that
    # gives no hint about what is missing.
    raise FileNotFoundError(
        'No sqlite metadata file found in {}. '
        'Run the Data Ingestion Notebook before running this.'.format(base_dir))
config = os.path.join(base_dir, file[0])

connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.filename_uri = config

store = metadata_store.MetadataStore(connection_config)

In [None]:
# Look up the most recent executions of the two upstream components this
# notebook depends on.
stat_executions = get_latest_executions(store, Config.PIPELINE_NAME, 'StatisticsGen')
schema_executions = get_latest_executions(store, Config.PIPELINE_NAME, 'SchemaGen')
if not (stat_executions and schema_executions):
    raise Exception('[Exception] Run the Data Ingestion Notebook before Running this...')

previous_execution_status_stat = stat_executions[0].last_known_state
previous_execution_status_schema = schema_executions[0].last_known_state

# Named ML Metadata execution states instead of the bare integers 2 and 3.
EXECUTION_RUNNING = metadata_store_pb2.Execution.RUNNING
EXECUTION_COMPLETE = metadata_store_pb2.Execution.COMPLETE

if (previous_execution_status_schema == EXECUTION_COMPLETE
        and previous_execution_status_stat == EXECUTION_COMPLETE):
    print('[INFO] previous component Execution State is Success. You can Proceed Further now..')
elif (previous_execution_status_schema == EXECUTION_RUNNING
        and previous_execution_status_stat == EXECUTION_COMPLETE):
    print('[Warning] SchemaGen Component Execution is in Running State')
elif (previous_execution_status_schema == EXECUTION_COMPLETE
        and previous_execution_status_stat == EXECUTION_RUNNING):
    print('[Warning] StatisticsGen Component Execution is in Running State')
elif (previous_execution_status_stat == EXECUTION_RUNNING
        and previous_execution_status_schema == EXECUTION_RUNNING):
    print('[Warning] previous component Execution is in Running State')
else:
    # Any other combination (e.g. a failed execution) used to pass silently.
    print('[Warning] Unexpected previous component Execution State '
          '(StatisticsGen: {}, SchemaGen: {})'.format(
              previous_execution_status_stat, previous_execution_status_schema))

In [None]:
def preprocessing_fn(inputs):
    """Toy preprocessing function for the two-feature dummy dataset.

    Args:
        inputs: dict with the raw feature tensors under the keys 'x1' and 'x2'.

    Returns:
        dict with 'x1_normalized' (result of tft.scale_to_0_1 on x1) and
        'x2_integerized' (result of tft.compute_and_apply_vocabulary on x2).
    """
    return {
        'x1_normalized': tft.scale_to_0_1(inputs['x1']),
        'x2_integerized': tft.compute_and_apply_vocabulary(inputs['x2']),
    }

```preprocessing_fn``` accepts and returns a dictionary of tensors. Our dummy dataset contains two variables 
> x1 => dtype: int; x2 => dtype: str

Our function creates two new tensors:
- The tensor x1_normalized is created using the convenience method tft.scale_to_0_1. This method uses analyzers to compute the minimum and maximum of x1 over the entire dataset and uses them to scale x1 into the range [0, 1].
- The tensor x2_integerized shows an example of string manipulation. In this case, we take a string and map it to an integer. This uses the convenience function tft.compute_and_apply_vocabulary. This function uses an analyzer to compute the unique values taken by the input strings, and then uses TensorFlow operations to convert the input strings to indices in the table of unique values.

The preprocessing function defines a pipeline of operations on a dataset. The typical workflow of a tf.Transform user will construct a preprocessing function, then incorporate this into a larger Beam pipeline, creating the data for training.


In [None]:
# Dummy in-memory dataset: a list of per-example dicts with the keys 'x1'
# and 'x2', matching the keys read by the toy preprocessing_fn.

raw_data = [
    {'x1': 1, 'x2': 'hello'},
    {'x1': 2, 'x2': 'world'},
    {'x1': 3, 'x2': 'hello'}
]

### Data Formats and Schema

The TFT Beam implementation accepts two different input data formats. The "instance dict" format is suitable for small datasets, while the TFXIO format provides improved performance and is suitable for large datasets.

- If raw_data_metadata is a dataset_metadata.DatasetMetadata, then raw_data is expected to be in the "instance dict" format.
- If raw_data_metadata is a tfxio.TensorAdapterConfig, then raw_data is expected to be in the TFXIO format.

#### Instance dict format

The metadata contains the schema, which defines the layout of the data so it can be read from and written to various formats.

In [None]:
# Feature spec for the dummy dataset: one scalar float feature and one
# scalar string feature per example.
raw_feature_spec = {
    'x1': tf.io.FixedLenFeature([], tf.float32),
    'x2': tf.io.FixedLenFeature([], tf.string),
}
raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(raw_feature_spec))

The Schema proto contains the information needed to parse the data from its on-disk or in-memory format, into tensors. It is typically constructed by calling schema_utils.schema_from_feature_spec with a dict mapping feature keys to ```tf.io.FixedLenFeature```, ```tf.io.VarLenFeature``` and ```tf.io.SparseFeature``` values.

Here we used tf.io.FixedLenFeature to indicate that each feature contains a fixed number of values, in this case a single scalar value.

#### TFXIO format

With this format, the data is expected to be contained in a pyarrow.RecordBatch. For tabular data, our Apache Beam implementation accepts Arrow RecordBatches that consist of columns of the following types:

- pa.list_(<primitive>), where <primitive> is pa.int64(), pa.float32(), pa.binary() or pa.large_binary().

- pa.large_list(<primitive>)

The toy input dataset we used above, when represented as a RecordBatch, looks like the following:

```python
raw_data = [
    pa.record_batch([
        pa.array([[1], [2], [3]], pa.list_(pa.float32())),
        pa.array([['hello'], ['world'], ['hello']], pa.list_(pa.binary())),
    ], ['x1', 'x2'])
]
```
Similar to DatasetMetadata being needed to accompany the "instance dict" format, a tfxio.TensorAdapterConfig is needed to accompany the RecordBatches. It consists of the Arrow schema of the RecordBatches, and TensorRepresentations to uniquely determine how columns in RecordBatches can be interpreted as TensorFlow Tensors (including but not limited to tf.Tensor, tf.SparseTensor).

TensorRepresentations is a Dict[Text, TensorRepresentation] which establishes the relationship between a Tensor that preprocessing_fn accepts and columns in the RecordBatches. For example:


```python
tensor_representation = {
    'x': text_format.Parse(
        """dense_tensor { column_name: "col1" shape { dim { size: 2 } } }""",
        schema_pb2.TensorRepresentation())
}
```
Means that inputs['x'] in preprocessing_fn should be a dense tf.Tensor, whose values come from a column of name 'col1' in the input RecordBatches, and its (batched) shape should be [batch_size, 2].

TensorRepresentation is a Protobuf message defined in TensorFlow Metadata (`schema_pb2.TensorRepresentation`).

### Executing preprocessing function

The preprocessing function is a logical description of a preprocessing pipeline that can be implemented on multiple data processing frameworks; tf.Transform provides a canonical implementation on Apache Beam. This implementation demonstrates the functionality required from an implementation.

In TFX, the analysis step extracts constant values from the data, and the transform step uses those constant values to make calculations on batches of data.


![figure_2](image/data_preprocessing_fig_2.png)

In [None]:
# Run the analyze + transform phases on the in-memory dummy data.
# The Context is given base_dir as its temp_dir.
with tft_beam.Context(temp_dir=base_dir):
    # Piping (data, metadata) into AnalyzeAndTransformDataset yields a pair:
    # the transformed dataset and the transform function.
    transformed_dataset, transform_fn = (
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

    # The transformed dataset is itself a (data, metadata) pair.
    transformed_data, transformed_metadata = transformed_dataset

What if we need to read data from disk?

Apache Beam provides functions to handle file ingestion effectively (e.g., with beam.io.ReadFromText()
or beam.io.ReadFromTFRecord()) in the context of building TensorFlow
models.

As you can see Apache Beam executions can get complex quickly, and the data scientists and machine learning engineers aren’t in the business of writing execution instructions from scratch. This is why TFX is so handy. It abstracts all the instructions under the hood and lets the data scientist focus on their problem-specific setups like defining the preprocessing_fn() function.



In [None]:
# Show the raw examples next to their transformed counterparts.
raw_data_text = pprint.pformat(raw_data)
transformed_data_text = pprint.pformat(transformed_data)
print(f'\nRaw data:\n{raw_data_text}\n')
print(f'Transformed data:\n{transformed_data_text}')

#### What artifacts are created?

- transformed_dataset
- transformed_metadata
- transform_fn

```transform_fn``` is a pure function that represents an operation that is applied to each row of the dataset. In particular, the analyzer values are already computed and treated as constants.

In [None]:
%%bash
tree ../temp_/tftransform_tmp

In [None]:
transform_fn

#### What does the transform graph look like?

In [None]:
from tensorflow.python.client import session
from tensorflow.python.framework import importer
from tensorflow.python.framework import ops
from tensorflow.python.summary import summary
from tensorflow.python.tools import saved_model_utils

try:
    from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
except ImportError:
    pass

In [None]:
from utils.tools import load_graph_to_tensorboard

In [None]:
# Path handed to load_graph_to_tensorboard below, taken from the first element
# of the first component of transform_fn.
# NOTE(review): presumably this is the exported transform SavedModel directory — confirm.
model_dir = os.path.join(transform_fn[0][0])
# Directory where the TensorBoard logs are written.
log_dir = os.path.join(root_dir, Config.TENSORBOARD_LOGGING)
log_dir

In [None]:
load_graph_to_tensorboard(model_dir, log_dir)

In [None]:
%tensorboard --logdir {log_dir}

Try to explore which nodes were added to the graph.

### Integrate TFT into Your Machine Learning Pipeline

Earlier, we investigated the dataset, determined which features are categorical or numerical, and explored the dataset using the Data Validation framework. This
information is crucial for defining our feature engineering.

In the following code, we define a function that applies transformations to the given features. The newly generated features are then used for model training.

In [None]:
# Feature configuration; the same values are written into the transform
# module file further down in this notebook.

# Name of the label column.
LABEL_KEY = "consumer_disputed"

# Feature name, feature dimensionality.
ONE_HOT_FEATURES = {
"product": 11,
"sub_product": 45,
"company_response": 5,
"state": 60,
"issue": 90
}

# Feature name, bucket count.
BUCKET_FEATURES = {
"zip_code": 10
}

# Feature name, value is unused.
TEXT_FEATURES = {
"consumer_complaint_narrative": None
}

#### Preprocessing function

With all the helper functions in place, we can now loop over each feature column and
transform it depending on the type. For example, for our features to be converted to
one-hot features, we convert the category names to an index with
tft.compute_and_apply_vocabulary() and then convert the index to a one-hot vector
representation with our helper function convert_num_to_one_hot(). Since we are using
tft.compute_and_apply_vocabulary(), TensorFlow Transform will first loop over
all categories and then determine a complete category-to-index mapping.

In [None]:
%%writefile {os.path.join(root_dir, Config.TRANSFORM_MODULE_SCRIPT)}

import tensorflow as tf
import tensorflow_transform as tft

# Name of the label column; it is passed through fill_in_missing in
# preprocessing_fn.
LABEL_KEY = "consumer_disputed"

# Feature name, feature dimensionality.
# preprocessing_fn uses dimensionality + 1 both as the vocabulary top_k and
# as the one-hot depth.
ONE_HOT_FEATURES = {
"product": 11,
"sub_product": 45,
"company_response": 5,
"state": 60,
"issue": 90
}

# Feature name, bucket count.
# preprocessing_fn one-hot encodes the bucket index with depth bucket count + 1.
BUCKET_FEATURES = {
"zip_code": 10
}

# Feature name, value is unused.
TEXT_FEATURES = {
"consumer_complaint_narrative": None
}

# It is a good practice to rename the features by appending a suffix to the feature name
def transformed_name(key):
    """Return the name of the transformed feature for the raw feature `key`.

    The transformed name is the raw name with the suffix '_xf' appended.
    """
    suffix = '_xf'
    return key + suffix

# Some of our features are of a sparse nature, but TFT expects the transformation outputs
# to be dense
# Following function will be used to convert the sparse into dense tensor
def fill_in_missing(x):
    """Densify a sparse feature and drop its trailing axis.

    A `tf.SparseTensor` input is converted to a dense tensor of shape
    [batch, 1]; missing entries become '' for string features and 0
    otherwise. Any input with more than one dimension then has axis 1
    squeezed away.

    Args:
        x: a `tf.Tensor` or `tf.SparseTensor`.
           NOTE(review): assumes at most one value per example, since the
           dense shape is forced to [batch, 1] — confirm against the schema.

    Returns:
        A dense tensor; rank 1 when the input was rank 2 with a second
        dimension of size 1.
    """
    # isinstance (rather than an exact type comparison) also recognises
    # subclasses of tf.SparseTensor.
    if isinstance(x, tf.SparseTensor):
        default_value = '' if x.dtype == tf.string else 0
        x = tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value)
    if len(x.shape) > 1:
        x = tf.squeeze(x, axis=1)
    return x

def convert_num_to_one_hot(label_tensor, num_labels=2):
    """One-hot encode an index tensor.

    Args:
        label_tensor: tensor of integer indices.
        num_labels: depth of the one-hot encoding (defaults to 2).

    Returns:
        The one-hot tensor reshaped to [-1, num_labels].
    """
    return tf.reshape(tf.one_hot(label_tensor, num_labels), [-1, num_labels])



def convert_zip_code(zip_code):
    """Convert zip-code strings to int64 numbers.

    Empty strings are treated as "00000" and every masked character 'X'
    is replaced by '0' before parsing (e.g. "123XX" -> 12300).

    Args:
        zip_code: string tensor of zip codes.
            NOTE(review): assumes values contain only digits and 'X' — confirm.

    Returns:
        An int64 tensor with the parsed zip codes.
    """
    # `zip_code` is a batch tensor, so a Python `if zip_code == ''` cannot
    # branch per element; substitute empty entries with a string op instead.
    zip_code = tf.strings.regex_replace(zip_code, '^$', "00000")
    # Replace each 'X' individually. The former pattern 'X{0,5}' also matches
    # the empty string, which inserts a "0" between every pair of digits.
    zip_code = tf.strings.regex_replace(zip_code, 'X', "0")
    return tf.strings.to_number(zip_code, out_type=tf.dtypes.int64)


def preprocessing_fn(inputs):
    """Build the transformed features from the raw input features.

    Uses the module-level LABEL_KEY, ONE_HOT_FEATURES, BUCKET_FEATURES and
    TEXT_FEATURES constants (no local copies, so there is a single source of
    truth for the feature configuration).

    Args:
        inputs: dict mapping raw feature names to tensors.

    Returns:
        dict mapping transformed feature names (see `transformed_name`) to
        the transformed tensors.
    """
    outputs = {}

    # Categorical features: vocabulary index -> one-hot vector.
    for key, dim in ONE_HOT_FEATURES.items():
        index = tft.compute_and_apply_vocabulary(
            fill_in_missing(inputs[key]), top_k=dim + 1)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            index, num_labels=dim + 1)

    # Bucketized features: numeric value -> bucket index -> one-hot vector.
    for key, bucket_count in BUCKET_FEATURES.items():
        temp_feature = tft.bucketize(
            convert_zip_code(fill_in_missing(inputs[key])),
            bucket_count,
            always_return_num_quantiles=False)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            temp_feature,
            num_labels=bucket_count + 1)

    # Text features and the label are only densified.
    for key in TEXT_FEATURES:
        outputs[transformed_name(key)] = fill_in_missing(inputs[key])
    outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY])

    return outputs

#### Transform Component

In [None]:
from tfx.types import artifact_utils
from tfx.types import standard_artifacts
from tfx.types import channel_utils

from tfx.orchestration.experimental.interactive import visualizations

In [None]:
# Fetch the latest artifacts produced by the example-gen components and wrap
# the Examples artifact in a channel.
# `Config.PIPELINE_NAME` was previously written as `Config,PIPELINE_NAME`,
# which raised a NameError and passed the wrong arguments.
artifacts = get_latest_artifacts(store, Config.PIPELINE_NAME, ['CsvExampleGen', 'ImportExampleGen'])

example_gen = find_latest_artifacts_by_type(store, artifacts, standard_artifacts.Examples.TYPE_NAME)
example_gen = channel_utils.as_channel(example_gen)

In [None]:
# Fetch the latest SchemaGen artifacts and wrap the Schema artifact in a channel.
artifacts = get_latest_artifacts(store, Config.PIPELINE_NAME, 'SchemaGen')
example_schema = find_latest_artifacts_by_type(store, artifacts, standard_artifacts.Schema.TYPE_NAME)
example_schema = channel_utils.as_channel(example_schema)

In [None]:
from tfx.components import Transform
from tfx.orchestration.experimental.interactive.interactive_context \
        import InteractiveContext

pipeline_name = Config.PIPELINE_NAME
base_root = os.path.split(os.getcwd())[0]
pipeline_root = os.path.join(base_root, 'temp_')
beam_args = [
    '--runner=DirectRunner'
]

# The pipeline root must already exist before this notebook is run.
if not os.path.exists(pipeline_root):
    raise Exception('Run Data Ingestion Notebook before running this')

context = InteractiveContext(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             beam_pipeline_args=beam_args)


root_dir = os.path.split(os.getcwd())[0]

# Transform component, using the preprocessing_fn from the written module file.
# The closing parenthesis of this call was missing, which made the whole
# cell a SyntaxError.
transform = Transform(
    examples=example_gen,
    schema=example_schema,
    module_file=os.path.join(root_dir, Config.TRANSFORM_MODULE_SCRIPT)
)

context.run(transform)

In [None]:
# The 'transform_fn' sub-directory under the URI of the first
# 'transform_graph' output artifact.
transform_graph_artifact = transform.outputs['transform_graph'].get()[0]
transform_fn_graph = os.path.join(transform_graph_artifact.uri, 'transform_fn')

In [None]:
load_graph_to_tensorboard(transform_fn_graph, log_dir)

In [None]:
%tensorboard --logdir {log_dir}