In [167]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

In [168]:
raw_train = pd.read_csv('./all/train.csv', index_col=0)
raw_test = pd.read_csv('./all/test.csv', index_col=0)

In [169]:
raw_train['is_test'] = 0
raw_test['is_test'] = 1

In [170]:
all_data = pd.concat((raw_train, raw_test), axis=0, sort=False)

In [171]:
all_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0
7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,0
9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0
10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0


In [172]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

In [173]:
def get_titles_from_names(df):
    df['Title']=get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

In [174]:
def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked', 
                                      'Cabin', 'Cabin_letter']))

In [175]:
def get_cabin_letter(df):
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]
    return(df)

In [176]:
def process_data(df):
     # preprocess Title, Cabin, and embarked
    df = get_titles_from_names(df)
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # Drop remaining feafures
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    df = get_dummy_cats(df)
    
    return(df)

In [177]:
proc_data = process_data(all_data)

In [178]:
proc_train = proc_data[proc_data['is_test']==0]
proc_test = proc_data[proc_data['is_test']==1]

In [179]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)

In [180]:
X_train_age = for_age_train.drop('Age', axis=1)
Y_train_age = for_age_train['Age']

In [181]:
tmodel = Sequential()
tmodel.add(Dense(128, input_dim=X_train_age.shape[1],  kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(1,9):
    tmodel.add(Dense(64, kernel_initializer='normal'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(0.25))

tmodel.add(Dense(1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [None]:
tmodel.fit(X_train_age.values, Y_train_age.values, epochs=600, verbose=2)

Epoch 1/600
0s - loss: 675.7717
Epoch 2/600
0s - loss: 229.5006
Epoch 3/600
0s - loss: 223.1818
Epoch 4/600
0s - loss: 217.1494
Epoch 5/600
0s - loss: 208.8200
Epoch 6/600
0s - loss: 201.9015
Epoch 7/600
0s - loss: 197.5562
Epoch 8/600
0s - loss: 199.4541
Epoch 9/600
0s - loss: 191.1520
Epoch 10/600
0s - loss: 193.7116
Epoch 11/600
0s - loss: 186.3871
Epoch 12/600
0s - loss: 194.1180
Epoch 13/600
0s - loss: 180.7137
Epoch 14/600
0s - loss: 173.9018
Epoch 15/600
0s - loss: 178.4820
Epoch 16/600
0s - loss: 180.5377
Epoch 17/600
0s - loss: 181.6693
Epoch 18/600
0s - loss: 176.5200
Epoch 19/600
0s - loss: 171.2988
Epoch 20/600
0s - loss: 164.5610
Epoch 21/600
0s - loss: 167.0769
Epoch 22/600
0s - loss: 174.0913
Epoch 23/600
0s - loss: 165.7127
Epoch 24/600
0s - loss: 168.3370
Epoch 25/600
0s - loss: 174.0455
Epoch 26/600
0s - loss: 154.8354
Epoch 27/600
0s - loss: 149.6777
Epoch 28/600
0s - loss: 156.6963
Epoch 29/600
0s - loss: 165.4892
Epoch 30/600
0s - loss: 165.7026
Epoch 31/600
0s - l

0s - loss: 129.6545
Epoch 247/600
0s - loss: 117.6698
Epoch 248/600
0s - loss: 116.5961
Epoch 249/600
0s - loss: 127.0336
Epoch 250/600
0s - loss: 125.6138
Epoch 251/600
0s - loss: 131.3548
Epoch 252/600
0s - loss: 127.1394
Epoch 253/600
0s - loss: 124.7196
Epoch 254/600
0s - loss: 125.7685
Epoch 255/600
0s - loss: 123.6367
Epoch 256/600
0s - loss: 124.8903
Epoch 257/600
0s - loss: 124.8987
Epoch 258/600
0s - loss: 121.4839
Epoch 259/600
0s - loss: 118.7575
Epoch 260/600
0s - loss: 129.5814
Epoch 261/600
0s - loss: 114.9562
Epoch 262/600
0s - loss: 126.7437
Epoch 263/600
0s - loss: 126.4146
Epoch 264/600
0s - loss: 119.1236
Epoch 265/600
0s - loss: 129.9265
Epoch 266/600
0s - loss: 127.6185
Epoch 267/600
0s - loss: 116.3237
Epoch 268/600
0s - loss: 123.9433
Epoch 269/600
0s - loss: 115.6391
Epoch 270/600
0s - loss: 124.6918
Epoch 271/600
0s - loss: 130.8986
Epoch 272/600
0s - loss: 120.5481
Epoch 273/600
0s - loss: 127.0022
Epoch 274/600
0s - loss: 127.7788
Epoch 275/600
0s - loss: 113

In [132]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

to_pric = train_data.loc[train_data['Age'].isnull()].drop(['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pric.values)

# p is of size (xxx,1)

train_data['Age'].loc[train_data['Age'].isnull()] = np.transpose(p)[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [134]:
test_data = proc_test
test_data.loc[test_data['Age'].isnull()]

to_pric = test_data.loc[test_data['Age'].isnull()].drop(['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pric.values)

# p is of size (xxx,1)

test_data['Age'].loc[test_data['Age'].isnull()] = np.transpose(p)[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [149]:
y_train = pd.get_dummies(train_data['Survived'])

In [153]:
X_train = train_data.drop(['Survived', 'is_test'], axis=1)

In [143]:
model = Sequential()
model.add(Dense(128, input_dim=X.shape[1], kernel_initializer='normal'))
model.add(Activation('relu'))

for i in range(15):
    model.add(Dense(128, kernel_initializer='normal'))
    model.add(Activation('relu'))
    model.add(Dropout(0.40))
    
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [146]:
model.fit(X_train.values, y_train.values, epochs=50, verbose=2)

Epoch 1/50
2s - loss: 0.6775 - acc: 0.6038
Epoch 2/50
0s - loss: 0.6682 - acc: 0.6162
Epoch 3/50
0s - loss: 0.6561 - acc: 0.6162
Epoch 4/50
0s - loss: 0.6073 - acc: 0.6162
Epoch 5/50
0s - loss: 0.5809 - acc: 0.7609
Epoch 6/50
0s - loss: 0.5354 - acc: 0.7991
Epoch 7/50
0s - loss: 0.5932 - acc: 0.7205
Epoch 8/50
0s - loss: 0.4956 - acc: 0.8462
Epoch 9/50
0s - loss: 0.4805 - acc: 0.8395
Epoch 10/50
0s - loss: 0.4800 - acc: 0.8182
Epoch 11/50
0s - loss: 0.5087 - acc: 0.8238
Epoch 12/50
0s - loss: 0.4686 - acc: 0.8339
Epoch 13/50
0s - loss: 0.4644 - acc: 0.8451
Epoch 14/50
0s - loss: 0.4340 - acc: 0.8440
Epoch 15/50
0s - loss: 0.4540 - acc: 0.8305
Epoch 16/50
0s - loss: 0.4436 - acc: 0.8361
Epoch 17/50
0s - loss: 0.4192 - acc: 0.8597
Epoch 18/50
0s - loss: 0.4310 - acc: 0.8608
Epoch 19/50
0s - loss: 0.4061 - acc: 0.8631
Epoch 20/50
0s - loss: 0.4131 - acc: 0.8519
Epoch 21/50
0s - loss: 0.4458 - acc: 0.8429
Epoch 22/50
0s - loss: 0.4391 - acc: 0.8429
Epoch 23/50
0s - loss: 0.4103 - acc: 0.86

<keras.callbacks.History at 0x12be69f98>

In [None]:
test_data.columns

In [None]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

In [None]:
p_survived

In [None]:
submission = pd.DataFrame()
submission['PassengerId']=test_data.index
submission['Survived']=p_survived

In [None]:
submission.shape

In [None]:
submission.to_csv('titanic_keras_cs.csv', index=False)