### Instalar las dependencias

In [1]:
!pip install tensorflow-transform



### Importar las librerías

In [2]:
import tempfile # Archivos temporales
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as tft_beam 

from __future__ import print_function
from tensorflow_transform.tf_metadata import dataset_metadata

### Pre Procesado de datos

In [3]:
dataset = pd.read_csv('pollution-small.csv')

In [4]:
dataset.head()

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25
3,1/4/2009,72.0,17.3,46.44,34.38
4,1/5/2009,81.0,25.64,56.56,45.59


In [5]:
# Eliminar la columna de la fecha
features = dataset.drop('Date', axis = 1)

In [6]:
features.head()

Unnamed: 0,pm10,no2,so2,soot
0,98.67,14.1,44.38,34.81
1,52.33,14.1,29.75,33.06
2,74.67,20.5,36.25,39.25
3,72.0,17.3,46.44,34.38
4,81.0,25.64,56.56,45.59


In [7]:
# Convertir el dataset a lista de diccionarios de python
dict_features = list(features.to_dict('index').values())

In [8]:
dict_features[:5]

[{'no2': 14.1, 'pm10': 98.67, 'so2': 44.38, 'soot': 34.81},
 {'no2': 14.1, 'pm10': 52.33, 'so2': 29.75, 'soot': 33.06},
 {'no2': 20.5, 'pm10': 74.67, 'so2': 36.25, 'soot': 39.25},
 {'no2': 17.3, 'pm10': 72.0, 'so2': 46.44, 'soot': 34.38},
 {'no2': 25.64, 'pm10': 81.0, 'so2': 56.56, 'soot': 45.59}]

#### Definir los metadatos del dataset

In [10]:
from tensorflow_transform.tf_metadata import schema_utils

In [12]:
data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'no2':  tf.io.FixedLenFeature([], tf.float32), # Longitud fija de tipo float
        'so2':  tf.io.FixedLenFeature([], tf.float32), # El shape [] indica que son números y no vectores
        'pm10': tf.io.FixedLenFeature([], tf.float32),
        'soot': tf.io.FixedLenFeature([], tf.float32),
    })
)

In [13]:
data_metadata

{'_schema': feature {
  name: "no2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pm10"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "so2"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "soot"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
}

#### Función preprocesing

In [14]:
def preprocessing_fn(inputs):

  no2 = inputs['no2']
  pm10 = inputs['pm10']
  so2 = inputs['so2']
  soot = inputs['soot']

  no2_normalized = no2 - tft.mean(no2)
  so2_normalized = no2 - tft.mean(so2)

  pm10_normalized = tft.scale_to_0_1(pm10)
  soot_normalized = tft.scale_by_min_max(soot)

  return {
      'no2_normalized': no2_normalized,
      'so2_normalized': so2_normalized,
      'pm10_normalized': pm10_normalized,
      'soot_normalized': soot_normalized
  }

TensorFlow Transform utiliza **Apache Beam** en segundo plano para llevar a cabo transformaciones de datos escalables. En esta función usaremos un ejecutor directo (direct runner)

Argumentos para el ejecutor

* `dict_features` - Nuestro dataset convertido a diccionario de python
* `data_metadata` - Los meta datos de nuestro dataset que hemos creado
* `preprocessing_fn` - La función de pre procesado principal. Se llamará para aplicar la operacion de pre procesado columna a columna

La sintaxis de Apache Beam es especial. Se utiliza para apilar operaciones e invocar transformaciones en nuestros datos en forma de _pipe_.

En nuestro caso sería
* **resultado** -> transformed_dataset, transform_fn
* **dato_a_pasar** -> (dict_features, data_metadata)
* **donde_pasar_el_dato** -> tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)

transformed_dataset, transform_fn = ((dict_features, data_metadata ) | 
tft_beam.AnalyzeAndTransformDataset((preprocessing_fn))

In [16]:
def data_transform():

  with tft_beam.Context( temp_dir = tempfile.mkdtemp()):
    transformed_dataset, transform_fn = ((dict_features, data_metadata ) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset

  for i in range(len(transformed_data)):
    print('Raw: ', dict_features[i])
    print('Transformed: ', transformed_data[i])

In [17]:
data_transform()







Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.






INFO:tensorflow:Assets written to: /tmp/tmpegjjkvr0/tftransform_tmp/7959d02514564bb2ba7b2bb96eaa217c/assets


INFO:tensorflow:Assets written to: /tmp/tmpegjjkvr0/tftransform_tmp/7959d02514564bb2ba7b2bb96eaa217c/assets


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:Assets written to: /tmp/tmpegjjkvr0/tftransform_tmp/548baaa1427e45b38b340b2423b2cba6/assets


INFO:tensorflow:Assets written to: /tmp/tmpegjjkvr0/tftransform_tmp/548baaa1427e45b38b340b2423b2cba6/assets


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


Raw:  {'pm10': 98.67, 'no2': 14.1, 'so2': 44.38, 'soot': 34.81}
Transformed:  {'no2_normalized': -18.577978, 'pm10_normalized': 0.34071696, 'so2_normalized': -1.424593, 'soot_normalized': 0.2834235}
Raw:  {'pm10': 52.33, 'no2': 14.1, 'so2': 29.75, 'soot': 33.06}
Transformed:  {'no2_normalized': -18.577978, 'pm10_normalized': 0.16963857, 'so2_normalized': -1.424593, 'soot_normalized': 0.26620758}
Raw:  {'pm10': 74.67, 'no2': 20.5, 'so2': 36.25, 'soot': 39.25}
Transformed:  {'no2_normalized': -12.1779785, 'pm10_normalized': 0.25211358, 'so2_normalized': 4.9754066, 'soot_normalized': 0.3271028}
Raw:  {'pm10': 72.0, 'no2': 17.3, 'so2': 46.44, 'soot': 34.38}
Transformed:  {'no2_normalized': -15.377979, 'pm10_normalized': 0.24225645, 'so2_normalized': 1.7754059, 'soot_normalized': 0.2791933}
Raw:  {'pm10': 81.0, 'no2': 25.64, 'so2': 56.56, 'soot': 45.59}
Transformed:  {'no2_normalized': -7.037979, 'pm10_normalized': 0.2754827, 'so2_normalized': 10.115406, 'soot_normalized': 0.38947368}
Raw: 