[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1p1VrpeaNa8tCuRjZXZAueUEupP5RvKm3?usp=sharing)


# Working with tables

## Titanic dataset download and overview

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

In [None]:
# Raise the row display limit so large frames are shown in full.
# The fully-qualified key is required: the short pattern 'max_rows' also matches
# 'styler.render.max_rows' on recent pandas and raises an OptionError.
pd.set_option('display.max_rows', 10000)

### Overview

In [None]:
# Remote CSV locations of the Titanic data: the training file and the
# `eval.csv` file, which this notebook refers to as the test data.
TRAIN_DATA_URL, TEST_DATA_URL = (
    "https://storage.googleapis.com/tf-datasets/titanic/train.csv",
    "https://storage.googleapis.com/tf-datasets/titanic/eval.csv",
)



## Data preparation and exploratory analysis


### Split the dataframe into train, validation and test

### Exploratory analysis

## Feature engineering

### Numeric columns

In [None]:
train.describe()
# You can see that the minimum values of most numeric fields are close to 0.
# The maximum values vary a fair bit: the fare paid goes up to 512, while the
# maximum number of siblings and spouses is only 8.
# That is quite a big difference in scale, which is why we apply normalization.

The `get_normalization_layer` function returns a layer that applies feature-wise normalization to a numerical feature.

In [None]:
def get_normalization_layer(name, dataset):
  """Return a `Normalization` layer adapted to one feature of `dataset`.

  Args:
    name: key used to pick the feature out of each element's feature mapping.
    dataset: dataset whose elements are `(features, label)` pairs, where
      `features` can be indexed by `name`.

  Returns:
    The `preprocessing.Normalization` layer after `adapt` has been called on
    the selected feature.
  """
  def select_feature(features, _label):
    # Drop the label and keep only the requested feature.
    return features[name]

  layer = preprocessing.Normalization(axis=None)
  # Let the layer learn its statistics from this single feature.
  layer.adapt(dataset.map(select_feature))
  return layer

In [None]:
# Take the 'age' entry of `train_features` and display it.
# NOTE(review): `train_features` is defined outside this excerpt — presumably
# one batch of features drawn from the training dataset; confirm.
age_column = train_features['age']
age_column

In [None]:
# Build a normalization layer adapted on the 'age' feature of `train_ds`,
# then apply it to the sample column to inspect the normalized values.
numeric_layer = get_normalization_layer('age', train_ds)
numeric_layer(age_column)

### Categorical columns


In this dataset, the town where passengers embarked is represented as a string (e.g. 'Southampton', 'Cherbourg', 'Queenstown' or 'unknown'). You cannot feed strings directly to a model, so a preprocessing layer takes care of representing each string as a one-hot vector.

In [None]:
# Show the distinct values present in the `embark_town` column of `train`.
train.embark_town.unique()

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that category-encodes one feature of `dataset`.

  Args:
    name: key used to pick the feature out of each element's feature mapping.
    dataset: dataset whose elements are `(features, label)` pairs, where
      `features` can be indexed by `name`.
    dtype: `'string'` selects a `StringLookup` layer; any other value selects
      an `IntegerLookup` layer.
    max_tokens: forwarded to the lookup layer as its `max_tokens` argument.

  Returns:
    A function that takes a feature tensor, maps it to integer indices with
    the adapted lookup layer and passes those indices through a
    `CategoryEncoding` layer sized to the learned vocabulary.
  """
  # Choose the lookup layer that turns raw values into integer indices.
  lookup_cls = (
      preprocessing.StringLookup if dtype == 'string'
      else preprocessing.IntegerLookup
  )
  index = lookup_cls(max_tokens=max_tokens)

  def select_feature(features, _label):
    # Drop the label and keep only the requested feature.
    return features[name]

  # Learn the set of possible values and give each a fixed integer index.
  index.adapt(dataset.map(select_feature))

  # Encoder whose width equals the size of the learned vocabulary.
  encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    # The closure keeps both layers alive so they can be reused later,
    # e.g. inside a functional model.
    return encoder(index(feature))

  return encode

In [None]:
# Take the 'embark_town' entry of `train_features` and display it.
# NOTE(review): `train_features` is defined outside this excerpt — confirm its type.
embark_town_column = train_features['embark_town']
embark_town_column

In [None]:
# Build an encoder for the string feature 'embark_town' from `train_ds`,
# then apply it to the sample column to inspect the encoded output.
categorical_layer = get_category_encoding_layer('embark_town', train_ds, 'string')
categorical_layer(embark_town_column)

### Creating a pipeline

In [None]:
# All three splits share one batch size. The training split relies on
# df_to_dataset's default `shuffle` setting; validation and test are built
# with shuffle=False.
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = [
  df_to_dataset(frame, shuffle=False, batch_size=batch_size)
  for frame in (val, test)
]

In [None]:
# Collect one Keras input per numeric column together with its normalized output.
numeric_columns = ['age', 'n_siblings_spouses', 'parch', 'fare']
all_inputs, encoded_features = [], []

for header in numeric_columns:
  # Symbolic input of shape (1,) named after the column.
  numeric_column = tf.keras.Input(shape=(1,), name=header)
  all_inputs.append(numeric_column)

  # The normalization layer is adapted on this column of train_ds.
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_column = normalization_layer(numeric_column)
  encoded_features.append(encoded_numeric_column)

In [None]:
# Display the column labels of `train`.
train.columns

In [None]:
# Categorical features whose raw values are strings.
categorical_columns = ['sex', 'class', 'embark_town', 'deck', 'alone']

for header in categorical_columns:
  # Symbolic string input of shape (1,) named after the column.
  categorical_column = tf.keras.Input(shape=(1,), name=header, dtype='string')
  all_inputs.append(categorical_column)

  # The encoder is adapted on this column of train_ds with max_tokens=5.
  encoding_layer = get_category_encoding_layer(
      header, train_ds, dtype='string', max_tokens=5)
  encoded_categorical_column = encoding_layer(categorical_column)
  encoded_features.append(encoded_categorical_column)

## Create, compile and train the model

### Create and compile model

### Train and evaluate model

### Inference on new data