# Titanic: Keras

An attempt at using Keras to create predictions for the Kaggle Titanic introductory competition.

In [10]:
import os
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import display
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# check working directory: the relative "titanic/..." paths used when loading
# and saving data resolve against it
print(os.getcwd())
# bare last expression, so the notebook echoes the installed TensorFlow version
tf.__version__

c:\Users\john.dls17\github\kg_titanic


'2.5.0'

In [90]:
# load data (paths are relative to the working directory)
labelled = pd.read_csv("titanic/train.csv")
test = pd.read_csv("titanic/test.csv")

# verify data contents
display(labelled.head())
display(test.head())
print(labelled.shape)
print(test.shape)
print(labelled.isnull().sum())

# drop columns that will not be used
labelled = labelled.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])

# convert columns to categories
labelled['Pclass'] = labelled['Pclass'].astype('category')
labelled['Sex'] = labelled['Sex'].astype('category')

# fill missing values
# NOTE(review): 20 is a hard-coded placeholder age; confirm it is the intended
# imputation value (e.g. versus the column median).
labelled['Age'] = labelled['Age'].fillna(20)

# split into training and validation sets; the fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts
train, val = train_test_split(labelled, test_size=0.2, random_state=42)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


(891, 12)
(418, 11)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
712 train examples
179 validation examples
418 test examples


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
772,773,0,2,female,57.0,0,0,10.5
206,207,0,3,male,32.0,1,0,15.85
272,273,1,2,female,41.0,0,1,19.5
619,620,0,2,male,26.0,0,0,10.5
343,344,0,2,male,25.0,0,0,13.0
164,165,0,3,male,1.0,4,1,39.6875
378,379,0,3,male,20.0,0,0,4.0125
861,862,0,2,male,21.0,1,0,11.5
530,531,1,2,female,2.0,1,1,26.0
21,22,1,2,male,34.0,0,0,13.0


In [91]:
# Helper that wraps a Pandas DataFrame in a tf.data pipeline
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a DataFrame into a batched tf.data.Dataset of (features, label) pairs.

  Args:
    dataframe: frame holding the feature columns plus a 'Survived' column,
      which is split off and used as the label.
    shuffle: when true, shuffle with a buffer as large as the number of rows.
    batch_size: number of rows per batch; also passed as the prefetch buffer size.

  Returns:
    A dataset whose elements are (dict of column name -> values, labels).
  """
  # Work on a copy so that popping the label leaves the caller's frame intact.
  features = dataframe.copy()
  target = features.pop('Survived')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(features))
  return dataset.batch(batch_size).prefetch(batch_size)

In [92]:
# Build the training pipeline with a tiny batch size so one batch is easy to read.
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

# take(1) yields exactly one (features, labels) element; unpack it directly.
((train_features, label_batch),) = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['Age'])
print('A batch of targets:', label_batch)

Every feature: ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
A batch of ages: tf.Tensor([54. 29. 24. 29. 24.], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([1 0 0 1 1], shape=(5,), dtype=int64)


In [93]:
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted to one feature of `dataset`.

  Args:
    name: key of the feature inside each element's feature dict.
    dataset: dataset yielding (features dict, label) pairs.

  Returns:
    A `preprocessing.Normalization(axis=None)` layer whose statistics were
    learned from the values of feature `name`.
  """
  # Keep only the requested feature; the label is not needed here.
  column_values = dataset.map(lambda features, _label: features[name])

  scaler = preprocessing.Normalization(axis=None)
  # adapt() computes the layer's statistics from the feature values.
  scaler.adapt(column_values)
  return scaler

In [94]:
# Try the normalization helper on the 'Fare' values of the sample batch.
# (The variable is named after the column it actually holds.)
fare_col = train_features['Fare']
layer = get_normalization_layer('Fare', train_ds)
layer(fare_col)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[ 0.8908543 ],
       [-0.4755191 ],
       [-0.4915332 ],
       [-0.12369471],
       [ 0.63333434]], dtype=float32)>

In [95]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Return a callable that encodes one categorical feature of `dataset`.

  Args:
    name: key of the feature inside each element's feature dict.
    dataset: dataset yielding (features dict, label) pairs.
    dtype: 'string' selects a StringLookup; any other value selects an
      IntegerLookup.
    max_tokens: forwarded to the lookup layer as its `max_tokens` argument.

  Returns:
    A function mapping a feature tensor to `CategoryEncoding(lookup(feature))`.
  """
  # Pick the lookup layer matching the feature's type; it maps raw values to
  # integer indices.
  if dtype == 'string':
    lookup_cls = preprocessing.StringLookup
  else:
    lookup_cls = preprocessing.IntegerLookup
  lookup = lookup_cls(max_tokens=max_tokens)

  # Learn the vocabulary from the values of this feature only.
  lookup.adapt(dataset.map(lambda features, _label: features[name]))

  # Encode the indices as vectors with one slot per vocabulary entry.
  one_hot = preprocessing.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  # The closure keeps both layers alive so they can be applied now or wired
  # into a functional model later.
  def encode(feature):
    return one_hot(lookup(feature))

  return encode

In [96]:
# Try the category encoder on the 'Sex' values of the sample batch.
# (The variable is named after the column it actually holds.)
sex_col = train_features['Sex']
layer = get_category_encoding_layer('Sex', train_ds, 'string')
layer(sex_col)

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)>

In [3]:
# Arbitrary baseline rule: Survived = 0 when Age > 30, otherwise 1.
# Computed in one vectorized step instead of appending rows in a Python loop.
# A missing Age compares False against 30, so those rows get 1 - the same
# result the row-by-row version produced.
prediction = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': np.where(test['Age'] > 30, 0, 1),
})

# verify prediction characteristics
display(len(prediction))
display(prediction.head())


418

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [None]:
# write the predictions out as the submission CSV; index=False leaves the
# DataFrame's row index out of the file
prediction.to_csv(path_or_buf="titanic/predictions.csv", index=False)