# Titanic: Keras
[Competition link](https://www.kaggle.com/c/titanic)

This is an attempt at using Keras to create predictions for Kaggle Titanic introductory competition. It also serves as a training exercise for data wrangling with Pandas and Keras/TF.

[Reference example](https://www.kaggle.com/cstahl12/titanic-with-keras)

In [6]:
from __future__ import print_function
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

# check working directory
print(os.getcwd())
tf.__version__

c:\Users\john.dls17\github\kg_titanic


'2.5.0'

In [7]:
# load data
raw_train = pd.read_csv("titanic/train.csv",index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv("titanic/test.csv",index_col=0)
raw_test['is_test'] = 1

# verify data contents
display(raw_train.head())
display(raw_test.head())
print(raw_train.shape)
print(raw_test.shape)
print(raw_train.isnull().sum())

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


(891, 12)
(418, 11)
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
is_test       0
dtype: int64


## Dataframe wrangling

Sources:

[Classify structured data using Keras Preprocessing Layers](https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)

In [9]:
all_data = pd.concat((raw_train, raw_test), axis=0)
all_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1


In [11]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

proc_data.head()

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,22.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1.0,38.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,26.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1.0,35.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0.0,35.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [12]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [13]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [14]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)

Epoch 1/600
33/33 - 20s - loss: 547.2176
Epoch 2/600
33/33 - 0s - loss: 231.3556
Epoch 3/600
33/33 - 0s - loss: 209.5515
Epoch 4/600
33/33 - 0s - loss: 199.3435
Epoch 5/600
33/33 - 0s - loss: 199.3244
Epoch 6/600
33/33 - 0s - loss: 185.4433
Epoch 7/600
33/33 - 0s - loss: 181.3978
Epoch 8/600
33/33 - 0s - loss: 177.3151
Epoch 9/600
33/33 - 0s - loss: 177.3075
Epoch 10/600
33/33 - 0s - loss: 164.5198
Epoch 11/600
33/33 - 0s - loss: 173.7698
Epoch 12/600
33/33 - 0s - loss: 150.6389
Epoch 13/600
33/33 - 0s - loss: 145.8751
Epoch 14/600
33/33 - 0s - loss: 157.1503
Epoch 15/600
33/33 - 0s - loss: 147.5205
Epoch 16/600
33/33 - 0s - loss: 128.6395
Epoch 17/600
33/33 - 0s - loss: 147.3926
Epoch 18/600
33/33 - 0s - loss: 151.5238
Epoch 19/600
33/33 - 0s - loss: 137.8134
Epoch 20/600
33/33 - 0s - loss: 135.5658
Epoch 21/600
33/33 - 0s - loss: 132.3635
Epoch 22/600
33/33 - 0s - loss: 127.1529
Epoch 23/600
33/33 - 0s - loss: 132.9336
Epoch 24/600
33/33 - 0s - loss: 124.2775
Epoch 25/600
33/33 - 0s 

<keras.callbacks.History at 0x179c0c931f0>

In [15]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
864,0.0,,8,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
869,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
879,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [26]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data.loc[train_data['Age'].isnull(),'Age'] = p

ValueError: Expect x to be a non-empty array or dataset.

In [25]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data.loc[test_data['Age'].isnull(),'Age'] = p

ValueError: Expect x to be a non-empty array or dataset.

In [27]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [28]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [29]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [30]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
28/28 - 5s - loss: 0.6803 - accuracy: 0.6162
Epoch 2/500
28/28 - 0s - loss: 0.6683 - accuracy: 0.6162
Epoch 3/500
28/28 - 0s - loss: 0.6638 - accuracy: 0.6162
Epoch 4/500
28/28 - 0s - loss: 0.6277 - accuracy: 0.6162
Epoch 5/500
28/28 - 0s - loss: 0.6342 - accuracy: 0.6094
Epoch 6/500
28/28 - 0s - loss: 0.6824 - accuracy: 0.5701
Epoch 7/500
28/28 - 0s - loss: 0.6618 - accuracy: 0.6162
Epoch 8/500
28/28 - 0s - loss: 0.6086 - accuracy: 0.6162
Epoch 9/500
28/28 - 0s - loss: 0.5566 - accuracy: 0.6207
Epoch 10/500
28/28 - 0s - loss: 0.5759 - accuracy: 0.7755
Epoch 11/500
28/28 - 0s - loss: 0.5741 - accuracy: 0.7116
Epoch 12/500
28/28 - 0s - loss: 0.5505 - accuracy: 0.8002
Epoch 13/500
28/28 - 0s - loss: 0.5057 - accuracy: 0.8249
Epoch 14/500
28/28 - 0s - loss: 0.5240 - accuracy: 0.8070
Epoch 15/500
28/28 - 0s - loss: 0.4760 - accuracy: 0.8182
Epoch 16/500
28/28 - 0s - loss: 0.4566 - accuracy: 0.8462
Epoch 17/500
28/28 - 0s - loss: 0.4753 - accuracy: 0.8474
Epoch 18/500
28/28 - 0s

<keras.callbacks.History at 0x179e36be880>

In [32]:
test_data.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'is_test', 'Title_Capt',
       'Title_Col', 'Title_Don', 'Title_Dona', 'Title_Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [33]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)



In [35]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived
submission.shape

(418, 2)

In [37]:
submission.to_csv('titanic/predictions.csv', index=False)