In [1]:
import os

# Select the backend Keras should use. This assignment runs before `keras` is
# imported in the following cell.
# NOTE(review): Keras reads KERAS_BACKEND at import time, so this cell must stay
# ahead of the import cell.
os.environ["KERAS_BACKEND"] = "tensorflow"

In [2]:
import tensorflow as tf
import pandas as pd
import keras as kr
import pathlib as pl

Separamos una porción de los datos para entrenamiento y validación.


In [3]:
# Turn the relative location of the raw training CSV into an absolute path
# (resolved against the current working directory) and load it as a DataFrame.
titanic_data_path = pl.Path("../data/raw/train.csv").resolve()
titanic_data = pd.read_csv(titanic_data_path)

In [4]:
# Columns kept for modelling: the "Survived" target followed by the input features.
selected_columns = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Keep only those columns and replace a missing embarkation port with the
# placeholder category "D", so "Embarked" has no missing entries afterwards.
titanic_data = titanic_data[selected_columns].fillna(value={"Embarked": "D"})

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible. The remaining rows form the training set.
titanic_data_validation = titanic_data.sample(frac=0.2, random_state=1337)
titanic_data_train = titanic_data.drop(titanic_data_validation.index)

# "Age" is never checked for missing values before this point (only "Embarked"
# is filled earlier), so any NaN would flow unmodified into preprocessing.
# Impute with the median of the *training* rows only and reuse that value for
# the validation rows, so no statistic computed on validation data leaks into
# training. This is a no-op when "Age" has no missing values.
age_fill_value = titanic_data_train["Age"].median()
titanic_data_train = titanic_data_train.fillna(value={"Age": age_fill_value})
titanic_data_validation = titanic_data_validation.fillna(
    value={"Age": age_fill_value}
)

print(
    "Contamos con {} registros para entrenamiento y {} registros para validar modelos".format(
        titanic_data_train.shape[0],
        titanic_data_validation.shape[0],
    )
)

Contamos con 713 registros para entrenamiento y 178 registros para validar modelos


In [6]:
def dataframe_to_dataset(
    dataframe: pd.DataFrame | pd.Series,
    target_col: str,
    shuffle: bool = True,
    seed: int | None = None,
) -> tf.data.Dataset:
    """Convert a pandas table into a ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The input is copied first, so the caller's object is never modified.

    Args:
        dataframe: Table holding the feature columns and the target column.
        target_col: Name of the column to split off as the label.
        shuffle: When True (the default, matching the previous behaviour) the
            dataset is shuffled with a buffer covering every row. Pass False
            for data whose order should be preserved.
        seed: Optional seed forwarded to ``Dataset.shuffle`` for a reproducible
            shuffle order. Ignored when ``shuffle`` is False.

    Returns:
        An unbatched dataset whose elements are ``(dict of feature tensors
        keyed by column name, label tensor)``.
    """
    features = dataframe.copy()
    # pop() removes the target column from the copy and returns it as the labels.
    labels = features.pop(item=target_col)
    ds = tf.data.Dataset.from_tensor_slices(tensors=(dict(features), labels))

    row_count = features.shape[0]
    # A shuffle buffer must be positive, so an empty table is returned unshuffled.
    if shuffle and row_count > 0:
        ds = ds.shuffle(buffer_size=row_count, seed=seed)
    return ds

In [7]:
# Build one tf.data.Dataset per split, both using "Survived" as the label.
titanic_dataset_train, titanic_dataset_validation = (
    dataframe_to_dataset(dataframe=split_frame, target_col="Survived")
    for split_frame in (titanic_data_train, titanic_data_validation)
)

# The datasets are still unbatched here, so len() reports the number of records.
split_sizes_message = "Contamos con {} registros para entrenamiento y {} registros para validar modelos".format(
    len(titanic_dataset_train),
    len(titanic_dataset_validation),
)
print(split_sizes_message)

Contamos con 713 registros para entrenamiento y 178 registros para validar modelos


In [8]:
# Look at a single (features, label) element to see how each record is laid out.
single_record = titanic_dataset_train.take(count=1)
for data, target in single_record:
    print("Cada registro cuenta con los siguientes datos:  {}".format(data))
    print("Cada registro cuenta con el siguiente label:  {}".format(target))

Cada registro cuenta con los siguientes datos:  {'Pclass': <tf.Tensor: shape=(), dtype=int64, numpy=3>, 'Sex': <tf.Tensor: shape=(), dtype=string, numpy=b'male'>, 'Age': <tf.Tensor: shape=(), dtype=float64, numpy=32.0>, 'SibSp': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'Parch': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'Fare': <tf.Tensor: shape=(), dtype=float64, numpy=56.4958>, 'Embarked': <tf.Tensor: shape=(), dtype=string, numpy=b'S'>}
Cada registro cuenta con el siguiente label:  1


In [9]:
# Group records into batches of 32 for both splits.
# Note: each name is rebound to its own batched version, so running this cell a
# second time would batch the already-batched datasets again.
titanic_dataset_train, titanic_dataset_validation = (
    titanic_dataset_train.batch(batch_size=32),
    titanic_dataset_validation.batch(batch_size=32),
)

In [10]:
# Per-column preprocessing. Dict order is kept identical to the column order of
# the input features.
feature_definitions = {
    # Categorical columns, declared with no out-of-vocabulary index.
    "Pclass": kr.utils.FeatureSpace.integer_categorical(num_oov_indices=0),
    "Sex": kr.utils.FeatureSpace.string_categorical(num_oov_indices=0),
    # Age is discretized into 8 bins rather than used as a raw number.
    "Age": kr.utils.FeatureSpace.float_discretized(num_bins=8),
    # Numeric columns that are normalized.
    "SibSp": kr.utils.FeatureSpace.float_normalized(),
    "Parch": kr.utils.FeatureSpace.float_normalized(),
    "Fare": kr.utils.FeatureSpace.float_normalized(),
    "Embarked": kr.utils.FeatureSpace.string_categorical(num_oov_indices=0),
}

# One crossed feature combining "Sex" with "Age", with crossing_dim=16.
feature_crosses = [
    kr.utils.FeatureSpace.cross(
        feature_names=("Sex", "Age"),
        crossing_dim=16,
    ),
]

# output_mode="concat": all encoded features are joined into a single tensor.
feature_space = kr.utils.FeatureSpace(
    features=feature_definitions,
    crosses=feature_crosses,
    output_mode="concat",
)

In [11]:
# Drop the label from every training element, leaving only the feature dict.
titanic_dataset_train_without_target = titanic_dataset_train.map(
    lambda features, _label: features
)

In [12]:
# Fit the preprocessing state of the feature space on the training features only.
feature_space.adapt(titanic_dataset_train_without_target)

In [13]:
# Run one training batch through the adapted feature space and report the shape
# and dtype of the encoded result.
first_batch = titanic_dataset_train.take(count=1)
for data, _ in first_batch:
    data_processed = feature_space(data)
    print("Dimensiones del dataset procesado: {}".format(data_processed.shape))
    print("Tipo de datos del dataset procesado: {}".format(data_processed.dtype))

Dimensiones del dataset procesado: (32, 36)
Tipo de datos del dataset procesado: <dtype: 'float32'>


In [14]:
def encode_features(data, target):
    """Encode a batch's feature dict with ``feature_space``; pass the label through."""
    return feature_space(data), target


# Apply the same encoding to both splits so they share one preprocessing path.
preprocessed_dataset_train = titanic_dataset_train.map(encode_features)
preprocessed_dataset_validation = titanic_dataset_validation.map(encode_features)

In [15]:
# Symbolic tensors exposed by the feature space: the single encoded feature
# tensor, and the per-feature inputs it is computed from.
encoded_features = feature_space.get_encoded_features()
dict_inputs = feature_space.get_inputs()

In [16]:
# Display the symbolic inputs: a dict keyed by feature name.
dict_inputs

{'Pclass': <KerasTensor shape=(None, 1), dtype=int32, sparse=None, name=Pclass>,
 'Sex': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=Sex>,
 'Age': <KerasTensor shape=(None, 1), dtype=float32, sparse=None, name=Age>,
 'SibSp': <KerasTensor shape=(None, 1), dtype=float32, sparse=None, name=SibSp>,
 'Parch': <KerasTensor shape=(None, 1), dtype=float32, sparse=None, name=Parch>,
 'Fare': <KerasTensor shape=(None, 1), dtype=float32, sparse=None, name=Fare>,
 'Embarked': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=Embarked>}

In [17]:
# Display the symbolic tensor holding all encoded features.
encoded_features

<KerasTensor shape=(None, 36), dtype=float32, sparse=False, name=keras_tensor_13>

In [18]:
# Five identical hidden layers: 32 units each with ReLU activation.
hidden_layers = [
    kr.layers.Dense(units=32, activation=kr.activations.relu) for _ in range(5)
]

# Classifier head: hidden stack -> dropout (rate 0.5) -> one sigmoid unit.
layers = kr.models.Sequential(
    layers=[
        *hidden_layers,
        kr.layers.Dropout(rate=0.5),
        kr.layers.Dense(units=1, activation=kr.activations.sigmoid),
    ]
)

# Apply the stack to the encoded features to obtain the symbolic model output.
predictions = layers(encoded_features)

In [19]:
# Model used for training: its input is the already-encoded feature tensor, so
# it is meant to be fed data that has been passed through `feature_space`.
training_model = kr.Model(inputs=encoded_features, outputs=predictions)

# Binary classification setup: Adam with default settings, binary
# cross-entropy loss, and accuracy as the reported metric.
training_model.compile(
    loss=kr.losses.binary_crossentropy,
    optimizer=kr.optimizers.Adam(),
    metrics=[kr.metrics.BinaryAccuracy()],
)

In [20]:
# Model used for inference: it takes the raw per-feature inputs and produces the
# same `predictions` output as the training model, so the feature encoding is
# part of the model itself.
inference_model = kr.Model(inputs=dict_inputs, outputs=predictions)

In [21]:
# Train on the encoded training batches and evaluate on the encoded validation
# batches after every epoch (verbose=2 prints one line per epoch).
# The returned History object is kept so the per-epoch metrics remain available
# afterwards via `training_history.history`, instead of being discarded and
# leaving only an object repr as the cell output.
training_history = training_model.fit(
    preprocessed_dataset_train,
    epochs=20,
    validation_data=preprocessed_dataset_validation,
    verbose=2,
)

Epoch 1/20
23/23 - 2s - 86ms/step - binary_accuracy: 0.6073 - loss: 0.6704 - val_binary_accuracy: 0.6404 - val_loss: 0.6318
Epoch 2/20
23/23 - 0s - 9ms/step - binary_accuracy: 0.6746 - loss: 0.6069 - val_binary_accuracy: 0.7809 - val_loss: 0.5570
Epoch 3/20
23/23 - 0s - 8ms/step - binary_accuracy: 0.7616 - loss: 0.5222 - val_binary_accuracy: 0.8146 - val_loss: 0.4974
Epoch 4/20
23/23 - 0s - 7ms/step - binary_accuracy: 0.7966 - loss: 0.5066 - val_binary_accuracy: 0.8090 - val_loss: 0.4819
Epoch 5/20
23/23 - 0s - 6ms/step - binary_accuracy: 0.8149 - loss: 0.4548 - val_binary_accuracy: 0.8146 - val_loss: 0.4689
Epoch 6/20
23/23 - 0s - 7ms/step - binary_accuracy: 0.8247 - loss: 0.4514 - val_binary_accuracy: 0.8090 - val_loss: 0.4593
Epoch 7/20
23/23 - 0s - 7ms/step - binary_accuracy: 0.8191 - loss: 0.4193 - val_binary_accuracy: 0.8202 - val_loss: 0.4612
Epoch 8/20
23/23 - 0s - 7ms/step - binary_accuracy: 0.8289 - loss: 0.4212 - val_binary_accuracy: 0.7978 - val_loss: 0.4622
Epoch 9/20
23/2

<keras.src.callbacks.history.History at 0x26baaf0f190>