Based on a code by : https://www.kaggle.com/cstahl12/titanic-with-keras

### Load environment

In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [3]:
raw_train = pd.read_csv('./train.csv', index_col = 0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('./test.csv', index_col = 0)
raw_test['is_test'] = 1

In [4]:
all_data = pd.concat((raw_train,raw_test),axis=0)

### Functions to preprocess the data

In [5]:
def get_title_last_name(name):
    full_name = name.str.split(',', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df,columns=['Title','Pclass','Sex','Embarked','Cabin','Cabin_letter']))

def get_cabin_letter(df):
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]
    return(df)


def process_data(df):
    #preprocess titles, cabin, embarked
    df = get_titles_from_names(df)
    df['Embarked'].fillna('S', inplace=True)
    df=get_cabin_letter(df)
    
    #drop remaining features
    df = df.drop(['Ticket','Fare'], axis=1)
    
    #create dummy values for categorical features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]


    
    
    
    
    
    


In [6]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_ Capt,Title_ Col,Title_ Don,Title_ Dona,Title_ Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,38.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,35.0,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


### Build Network to predict missing ages

In [7]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis = 1).dropna(axis=0)
X_train_age=for_age_train.drop('Age', axis=1)
y_train_age=for_age_train['Age']



In [9]:
#create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                kernel_initializer='normal',
                bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0,8):
    tmodel.add(Dense(units=64,kernel_initializer='normal',
                   bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))
    
tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')


In [10]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)

Epoch 1/600
0s - loss: 603.5002
Epoch 2/600
0s - loss: 236.4572
Epoch 3/600
0s - loss: 209.7210
Epoch 4/600
0s - loss: 191.3419
Epoch 5/600
0s - loss: 188.2952
Epoch 6/600
0s - loss: 193.1680
Epoch 7/600
0s - loss: 184.4908
Epoch 8/600
0s - loss: 170.1633
Epoch 9/600
0s - loss: 169.0257
Epoch 10/600
0s - loss: 160.9776
Epoch 11/600
0s - loss: 163.0992
Epoch 12/600
0s - loss: 162.9110
Epoch 13/600
0s - loss: 154.6564
Epoch 14/600
0s - loss: 153.8991
Epoch 15/600
0s - loss: 139.2260
Epoch 16/600
0s - loss: 145.9740
Epoch 17/600
0s - loss: 136.8496
Epoch 18/600
0s - loss: 129.7881
Epoch 19/600
0s - loss: 130.4263
Epoch 20/600
0s - loss: 123.2582
Epoch 21/600
0s - loss: 124.8989
Epoch 22/600
0s - loss: 125.2096
Epoch 23/600
0s - loss: 121.8449
Epoch 24/600
0s - loss: 120.0710
Epoch 25/600
0s - loss: 128.6148
Epoch 26/600
0s - loss: 120.6659
Epoch 27/600
0s - loss: 116.7738
Epoch 28/600
0s - loss: 113.5677
Epoch 29/600
0s - loss: 120.9264
Epoch 30/600
0s - loss: 116.9315
Epoch 31/600
0s - l

0s - loss: 94.9011
Epoch 251/600
0s - loss: 86.3108
Epoch 252/600
0s - loss: 85.5782
Epoch 253/600
0s - loss: 90.5309
Epoch 254/600
0s - loss: 88.9398
Epoch 255/600
0s - loss: 88.1143
Epoch 256/600
0s - loss: 91.5138
Epoch 257/600
0s - loss: 92.2985
Epoch 258/600
0s - loss: 91.4078
Epoch 259/600
0s - loss: 85.9500
Epoch 260/600
0s - loss: 90.9490
Epoch 261/600
0s - loss: 88.8892
Epoch 262/600
0s - loss: 92.7581
Epoch 263/600
0s - loss: 87.3014
Epoch 264/600
0s - loss: 84.5949
Epoch 265/600
0s - loss: 88.6961
Epoch 266/600
0s - loss: 88.5077
Epoch 267/600
0s - loss: 87.1133
Epoch 268/600
0s - loss: 89.4018
Epoch 269/600
0s - loss: 86.9905
Epoch 270/600
0s - loss: 88.5680
Epoch 271/600
0s - loss: 88.6760
Epoch 272/600
0s - loss: 91.5765
Epoch 273/600
0s - loss: 89.3371
Epoch 274/600
0s - loss: 87.1678
Epoch 275/600
0s - loss: 86.0569
Epoch 276/600
0s - loss: 90.4894
Epoch 277/600
0s - loss: 90.7653
Epoch 278/600
0s - loss: 91.4343
Epoch 279/600
0s - loss: 86.9783
Epoch 280/600
0s - loss:

0s - loss: 80.7058
Epoch 501/600
0s - loss: 79.7777
Epoch 502/600
0s - loss: 82.9394
Epoch 503/600
0s - loss: 80.7799
Epoch 504/600
0s - loss: 82.6057
Epoch 505/600
0s - loss: 82.1199
Epoch 506/600
0s - loss: 78.4089
Epoch 507/600
0s - loss: 79.8514
Epoch 508/600
0s - loss: 81.6625
Epoch 509/600
0s - loss: 81.4602
Epoch 510/600
0s - loss: 84.8239
Epoch 511/600
0s - loss: 83.0950
Epoch 512/600
0s - loss: 80.0576
Epoch 513/600
0s - loss: 80.9962
Epoch 514/600
0s - loss: 82.5655
Epoch 515/600
0s - loss: 80.6302
Epoch 516/600
0s - loss: 78.0197
Epoch 517/600
0s - loss: 80.3279
Epoch 518/600
0s - loss: 82.6930
Epoch 519/600
0s - loss: 83.3085
Epoch 520/600
0s - loss: 83.1506
Epoch 521/600
0s - loss: 80.6281
Epoch 522/600
0s - loss: 81.6301
Epoch 523/600
0s - loss: 80.2654
Epoch 524/600
0s - loss: 80.7731
Epoch 525/600
0s - loss: 79.5113
Epoch 526/600
0s - loss: 81.8289
Epoch 527/600
0s - loss: 81.1323
Epoch 528/600
0s - loss: 81.7670
Epoch 529/600
0s - loss: 82.4840
Epoch 530/600
0s - loss:

<keras.callbacks.History at 0x7f7ebd95ed30>

In [11]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_ Capt,Title_ Col,Title_ Don,Title_ Dona,Title_ Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,,0,1,1.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [12]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(['Age','Survived','is_test'],axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(['Age','Survived','is_test'],axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [15]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_ Capt,Title_ Col,Title_ Don,Title_ Dona,Title_ Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [16]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [17]:
X = train_data.drop(['Survived','is_test'], axis=1)

In [42]:
#create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1],units=128, kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0,15):
    model.add(Dense(units=128, kernel_initializer='he_normal', bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [43]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
0s - loss: 2.8831 - acc: 0.5174
Epoch 2/500
0s - loss: 0.9472 - acc: 0.5297
Epoch 3/500
0s - loss: 0.7578 - acc: 0.5623
Epoch 4/500
0s - loss: 0.7018 - acc: 0.5847
Epoch 5/500
0s - loss: 0.6877 - acc: 0.5769
Epoch 6/500
0s - loss: 0.6924 - acc: 0.5948
Epoch 7/500
0s - loss: 0.6742 - acc: 0.5971
Epoch 8/500
0s - loss: 0.6886 - acc: 0.5982
Epoch 9/500
0s - loss: 0.6816 - acc: 0.5971
Epoch 10/500
0s - loss: 0.6704 - acc: 0.6027
Epoch 11/500
0s - loss: 0.6901 - acc: 0.5948
Epoch 12/500
0s - loss: 0.6639 - acc: 0.6105
Epoch 13/500
0s - loss: 0.6716 - acc: 0.6049
Epoch 14/500
0s - loss: 0.6727 - acc: 0.6195
Epoch 15/500
0s - loss: 0.6637 - acc: 0.6072
Epoch 16/500
0s - loss: 0.6730 - acc: 0.6117
Epoch 17/500
0s - loss: 0.6685 - acc: 0.6128
Epoch 18/500
0s - loss: 0.6640 - acc: 0.6117
Epoch 19/500
0s - loss: 0.6529 - acc: 0.6139
Epoch 20/500
0s - loss: 0.6475 - acc: 0.6150
Epoch 21/500
0s - loss: 0.6565 - acc: 0.6139
Epoch 22/500
0s - loss: 0.6379 - acc: 0.6173
Epoch 23/500
0s - l

0s - loss: 0.3777 - acc: 0.8687
Epoch 183/500
0s - loss: 0.3854 - acc: 0.8855
Epoch 184/500
0s - loss: 0.3526 - acc: 0.8799
Epoch 185/500
0s - loss: 0.3410 - acc: 0.8822
Epoch 186/500
0s - loss: 0.3694 - acc: 0.8765
Epoch 187/500
0s - loss: 0.3607 - acc: 0.8833
Epoch 188/500
0s - loss: 0.3940 - acc: 0.8597
Epoch 189/500
0s - loss: 0.3811 - acc: 0.8552
Epoch 190/500
0s - loss: 0.3720 - acc: 0.8754
Epoch 191/500
0s - loss: 0.3790 - acc: 0.8777
Epoch 192/500
0s - loss: 0.3905 - acc: 0.8754
Epoch 193/500
0s - loss: 0.3801 - acc: 0.8698
Epoch 194/500
0s - loss: 0.3598 - acc: 0.8911
Epoch 195/500
0s - loss: 0.3654 - acc: 0.8810
Epoch 196/500
0s - loss: 0.3806 - acc: 0.8788
Epoch 197/500
0s - loss: 0.3892 - acc: 0.8721
Epoch 198/500
0s - loss: 0.3782 - acc: 0.8754
Epoch 199/500
0s - loss: 0.3753 - acc: 0.8732
Epoch 200/500
0s - loss: 0.3654 - acc: 0.8721
Epoch 201/500
0s - loss: 0.3762 - acc: 0.8721
Epoch 202/500
0s - loss: 0.3519 - acc: 0.8732
Epoch 203/500
0s - loss: 0.3397 - acc: 0.8844
Ep

0s - loss: 0.3521 - acc: 0.8866
Epoch 363/500
0s - loss: 0.3308 - acc: 0.8956
Epoch 364/500
0s - loss: 0.3221 - acc: 0.8934
Epoch 365/500
0s - loss: 0.3452 - acc: 0.8889
Epoch 366/500
0s - loss: 0.3571 - acc: 0.8844
Epoch 367/500
0s - loss: 0.3652 - acc: 0.8844
Epoch 368/500
0s - loss: 0.3558 - acc: 0.8855
Epoch 369/500
0s - loss: 0.3262 - acc: 0.8956
Epoch 370/500
0s - loss: 0.3469 - acc: 0.8889
Epoch 371/500
0s - loss: 0.3292 - acc: 0.8866
Epoch 372/500
0s - loss: 0.3019 - acc: 0.9001
Epoch 373/500
0s - loss: 0.3442 - acc: 0.8956
Epoch 374/500
0s - loss: 0.3236 - acc: 0.8956
Epoch 375/500
0s - loss: 0.3247 - acc: 0.8889
Epoch 376/500
0s - loss: 0.3250 - acc: 0.8990
Epoch 377/500
0s - loss: 0.3050 - acc: 0.8956
Epoch 378/500
0s - loss: 0.3215 - acc: 0.8934
Epoch 379/500
0s - loss: 0.3303 - acc: 0.8889
Epoch 380/500
0s - loss: 0.3308 - acc: 0.8900
Epoch 381/500
0s - loss: 0.3418 - acc: 0.8833
Epoch 382/500
0s - loss: 0.3993 - acc: 0.8765
Epoch 383/500
0s - loss: 0.3469 - acc: 0.8990
Ep

<keras.callbacks.History at 0x7f7ebdbca978>

In [21]:
test_data.columns

Index(['Age', 'Parch', 'SibSp', 'Survived', 'is_test', 'Title_ Capt',
       'Title_ Col', 'Title_ Don', 'Title_ Dona', 'Title_ Dr',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=227)

In [38]:
p_survived = model.predict_classes(test_data.drop(['Survived','is_test'],axis = 1).values)




 32/418 [=>............................] - ETA: 0s

In [39]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [40]:
submission.shape

(418, 2)

In [41]:
submission.to_csv('titanic_keras1.csv', index=False)