Carga de archivos CSV con tf.data en memoria --- 12:17 min
===

* Última modificación: Marzo 6, 2022 | [YouTube]()

Importación de librerías
---

In [1]:
import os

# Configure TensorFlow's native log level through the environment BEFORE the
# `import tensorflow` line below, so the setting is already present when the
# library loads.
# NOTE(review): the value "2" is understood to hide INFO and WARNING messages
# from TensorFlow's native logging -- confirm for the TF version in use.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf

Carga y preparación de datos
---

In [2]:
import numpy as np
import pandas as pd

# Read the Titanic training CSV straight into an in-memory DataFrame.
titanic = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv")

# Work on a copy so `titanic` keeps every column; `pop` removes the
# "survived" target from the copy and returns it as the label Series.
titanic_features = titanic.copy()
titanic_labels = titanic_features.pop("survived")

# One NumPy array per feature column, keyed by column name.
titanic_features_dict = {
    column: np.array(titanic_features[column])
    for column in titanic_features.columns
}

Iteración usando un slice manual
---

In [3]:
import itertools


def slices(features):
    """Yield the examples of a column-oriented feature mapping one at a time.

    Args:
        features: Mapping from feature name to a sized, integer-indexable
            sequence of values (for example a list or a NumPy array).

    Yields:
        dict: ``{name: values[i]}`` for ``i = 0, 1, 2, ...``.

    The generator finishes cleanly after the last row of the shortest column.
    The previous implementation looped over an unbounded ``itertools.count()``
    and therefore raised ``IndexError`` as soon as a consumer read past the
    data, and looped forever on an empty mapping.
    """
    #
    # Bound the iteration by the shortest column so every index is valid.
    # `default=0` makes an empty mapping produce no examples.
    #
    n_examples = min((len(values) for values in features.values()), default=0)
    for i in range(n_examples):
        yield {name: values[i] for name, values in features.items()}


# Show only the first generated example: islice stops the generator after
# one item, and each feature is printed as a "name: value" line.
first_only = itertools.islice(slices(titanic_features_dict), 1)
for example in first_only:
    for name, value in example.items():
        print(f"{name:19s}: {value}")

sex                : male
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : Third
deck               : unknown
embark_town        : Southampton
alone              : n


Iteración usando from_tensor_slices
---

In [4]:
#
# from_tensor_slices lets us iterate over the examples of the dataset.
#
features_ds = tf.data.Dataset.from_tensor_slices(titanic_features_dict)

# take(1) restricts the iteration to the first example only.
for example in features_ds.take(1):
    for name, value in example.items():
        print(f"{name:19s}: {value}")

sex                : b'male'
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : b'Third'
deck               : b'unknown'
embark_town        : b'Southampton'
alone              : b'n'


In [5]:
#
# Dataset creation: each element is a (features dict, label) pair.
#
features_and_labels = (titanic_features_dict, titanic_labels)
titanic_ds = tf.data.Dataset.from_tensor_slices(features_and_labels)

# Shuffle with a buffer as large as the number of labels, then group the
# shuffled examples into batches of 32.
titanic_batches = titanic_ds.shuffle(len(titanic_labels)).batch(32)

Preprocesamiento de los datos
---

In [6]:
def make_inputs(titanic_features):
    """Build one symbolic Keras input per feature column.

    Args:
        titanic_features: Object whose ``.items()`` yields ``(name, column)``
            pairs where each column exposes a ``.dtype`` (the feature
            DataFrame in this notebook).

    Returns:
        dict: Column name -> ``tf.keras.Input`` of shape ``(1,)`` named after
        the column. Columns with ``object`` dtype become ``tf.string`` inputs;
        every other column becomes a ``tf.float32`` input.
    """
    return {
        name: tf.keras.Input(
            shape=(1,),
            name=name,
            dtype=tf.string if column.dtype == object else tf.float32,
        )
        for name, column in titanic_features.items()
    }


def preprocess_numeric_inputs(inputs):
    """Concatenate and normalize the ``tf.float32`` inputs.

    Args:
        inputs: Dict mapping feature name -> Keras input tensor.

    Returns:
        list: A single-element list holding the normalized concatenation of
        all float32 inputs.

    Note:
        The normalization layer is adapted on the matching columns of the
        module-level ``titanic`` DataFrame, not on an argument.
    """
    # Keep only the float32 inputs, preserving the order of `inputs`.
    numeric_inputs = {}
    for name, tensor in inputs.items():
        if tensor.dtype == tf.float32:
            numeric_inputs[name] = tensor

    merged = tf.keras.layers.Concatenate()(list(numeric_inputs.values()))

    # Fit the normalization statistics on the same columns, in the same order
    # as the concatenated inputs.
    normalizer = tf.keras.layers.Normalization()
    reference_values = np.array(titanic[numeric_inputs.keys()])
    normalizer.adapt(reference_values)

    return [normalizer(merged)]


def preprocess_categoric_inputs(inputs):
    """Encode every non-float32 input as a multi-hot vector.

    Args:
        inputs: Dict mapping feature name -> Keras input tensor.

    Returns:
        list: One encoded tensor per non-float32 input, in the order of
        ``inputs``.

    Note:
        Each vocabulary is taken from the unique values of the matching column
        in the module-level ``titanic_features`` DataFrame.
    """
    encoded = []
    for name, tensor in inputs.items():
        is_numeric = tensor.dtype == tf.float32
        if not is_numeric:
            # String -> integer index, using the column's unique values.
            vocabulary = np.unique(titanic_features[name])
            lookup = tf.keras.layers.StringLookup(vocabulary=vocabulary)
            # Integer index -> vector sized by the lookup's vocabulary size.
            encoder = tf.keras.layers.CategoryEncoding(
                num_tokens=lookup.vocabulary_size(),
                output_mode="multi_hot",
            )
            encoded.append(encoder(lookup(tensor)))
    return encoded


# Symbolic inputs, one per feature column.
inputs = make_inputs(titanic_features)

# Preprocess the numeric and the categorical inputs separately.
numeric_inputs = preprocess_numeric_inputs(inputs)
categoric_inputs = preprocess_categoric_inputs(inputs)

# Join all preprocessed tensors (numeric first, then categorical) into one.
concatenate_layer = tf.keras.layers.Concatenate()
preprocessed_inputs = concatenate_layer([*numeric_inputs, *categoric_inputs])

# Model mapping the raw feature inputs to the single preprocessed tensor.
preprocessing_head = tf.keras.Model(inputs, preprocessed_inputs)

In [7]:
def make_titanic_model(preprocessing_head, inputs):
    """Build and compile the classifier on top of the preprocessing model.

    Args:
        preprocessing_head: Keras model that maps ``inputs`` to a single
            preprocessed tensor.
        inputs: Dict of Keras input tensors accepted by ``preprocessing_head``.

    Returns:
        A compiled ``tf.keras.Model`` from ``inputs`` to one logit per example.
    """
    preprocessed_inputs = preprocessing_head(inputs)

    body = tf.keras.Sequential(
        [
            # The hidden layer needs a non-linearity: without it the two Dense
            # layers compose into a single linear transformation and the
            # 64 hidden units add no modelling capacity.
            tf.keras.layers.Dense(64, activation="relu"),
            # No activation here: the output is a logit, matching
            # `from_logits=True` in the loss below.
            tf.keras.layers.Dense(1),
        ],
    )
    result = body(preprocessed_inputs)

    model = tf.keras.Model(
        inputs,
        result,
    )

    model.compile(
        loss=tf.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.optimizers.Adam(),
    )
    return model


# Build the end-to-end model (preprocessing + classifier) ...
titanic_model = make_titanic_model(preprocessing_head=preprocessing_head, inputs=inputs)

# ... and train it on the shuffled, batched dataset for 5 epochs.
history = titanic_model.fit(x=titanic_batches, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
