In [1]:
import keras as kr
import pandas as pd
import tensorflow as tf
from pathlib import Path

In [2]:
data_raw_path = Path("../data/raw/train.csv")
data_raw_path.resolve()
data_raw = pd.read_csv(filepath_or_buffer=data_raw_path)
features = data_raw.copy().loc[
    :,
    [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
    ],
]

features.loc[features["Embarked"].isna(), "Embarked"] = "D"
features

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [3]:
features["Fare"].shape

(891,)

In [4]:
labels = data_raw.copy().pop(item="Survived")

In [5]:
# change types of features
features = features.astype(
    dtype={
        "Sex": "string",
        "Embarked": "string",
    }
)

In [6]:
inputs = {
    "Pclass": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Pclass"],
        name="Pclass",
    ),
    "Sex": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Sex"],
        name="Sex",
    ),
    "Age": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Age"],
        name="Age",
    ),
    "SibSp": kr.Input(
        shape=(1,),
        dtype=features.dtypes["SibSp"],
        name="SibSp",
    ),
    "Parch": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Parch"],
        name="Parch",
    ),
    "Fare": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Fare"],
        name="Fare",
    ),
    "Embarked": kr.Input(
        shape=(1,),
        dtype=features.dtypes["Embarked"],
        name="Embarked",
    ),
}

pclass_output = kr.layers.CategoryEncoding(num_tokens=3)
gender_output = kr.layers.StringLookup(vocabulary=features["Sex"].unique())
age_output = kr.layers.Discretization(bin_boundaries=[0, 6, 12, 18, 26, 59])
sibsp_output = kr.layers.Normalization()
parch_output = kr.layers.Normalization()
fare_output = kr.layers.Normalization()
embarked_output = kr.layers.StringLookup(vocabulary=features["Embarked"].unique())

outputs = {
    "Pclass": pclass_output(inputs["Pclass"]),
    "Sex": gender_output(inputs["Sex"]),
    "Age": age_output(inputs["Age"]),
    "SibSp": sibsp_output(inputs["SibSp"]),
    "Parch": parch_output(inputs["Parch"]),
    "Fare": fare_output(inputs["Fare"]),
    "Embarked": embarked_output(inputs["Embarked"]),
}

In [7]:
preprocessing_model = kr.Model(inputs, outputs)

In [8]:
features_processed = preprocessing_model(dict(features))

In [9]:
dataset = tf.data.Dataset.from_tensor_slices(tensors=(dict(features_processed), labels))
dataset

<_TensorSliceDataset element_spec=({'Pclass': TensorSpec(shape=(3,), dtype=tf.float32, name=None), 'Sex': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'Age': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'SibSp': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'Parch': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'Fare': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'Embarked': TensorSpec(shape=(1,), dtype=tf.int64, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [10]:
dataset.save("dataset.keras")

In [11]:
dataset_loaded = tf.data.Dataset.load("dataset.keras")
dataset_loaded

<_LoadDataset element_spec=({'Pclass': TensorSpec(shape=(3,), dtype=tf.float32, name=None), 'SibSp': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'Embarked': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'Sex': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'Age': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'Parch': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'Fare': TensorSpec(shape=(1,), dtype=tf.float32, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>