In [2]:
"""
This script imports the Kaggle Titanic dataset in .csv form, extracts a new feature from name data, imputates missing
values using an autoencoder initialized neural network set for regression. Lastly it builds an autoencoder initalized
deep neural network to determine if an individual in the test set is likely to have survived. Keras is used for all NNs.

Originally I randomly partitioned off 91 of 891 examples for validation testing, however, training accuracy was severely 
affected and test accuracy was very erratic depending on which 91 were chosen, i.e. insufficent data. As my primary goals 
were to practice using Pandas and neural networks, I did not look at other ways to deal with this. However, I ran the 
autoencoder on the entire dataset during training, including the test set. This obviously risks overfitting the test set,
but I wanted to see if it would help, as that would suggest that mixing unlabeled and labeled data in such a fashion may 
be useful. 

There is also a massive bug- monitoring training via the verbosity setting on Keras results in Jupyter bugs. The latest
update fixed the crashing, but now the latency is horrendous. To work around this, multiple models with increasing epochs
are run to easily pick the best one, at the cost of a significant increase in complexity. When autoencoder initialized 
layers are used, I accidentally set it to shallow copy, and as such it is trained by every model that touches it. For some
reason this helps by a few percent, and fixing it makes it worse, even with more epochs during the AE initialization. This
may help counter the vanishing gradient problem in a manner reminiscent of deep belief networks, but further research on 
larger, and more varied data sets with more test runs are necessary.

I feel the need to include the disclaimer that hammering everything with complex models and large deep nets
may not always be optimal, and that this script is a little haphazard in that regard. Compared to some other public SVM 
and random forest based models on the leaderboard, this is exceptionally slow even with GPU acceleration. Time permitting,
I want to explore combining these methods together in a committee/ensemble, as that has ranked well on the leaderboard.
"""

import csv
import numpy
import pandas

dfTrain=pandas.read_csv('train.csv')
dfTest=pandas.read_csv('test.csv')

print('Training Examples:',len(dfTrain))
print('Test Examples:',len(dfTest))
dfAll=dfTrain.append(dfTest) 
print('All Examples:',len(dfAll))

Training Examples: 891
Test Examples: 418
All Examples: 1309


In [3]:
#Determine social ranking from names to create an additional feature
dfAll['Title']=dfAll['Name'].apply( lambda x:( (x.split(', ')[1]).split(' ')[0] ) )

dfAll['Title']=dfAll['Title'].replace(to_replace=['Don.','Rev.','Master.','Dr.','Col.','Capt.',
                                                      'Major.','Jonkheer.','Lady.','the','Sir.','Dona.'], value=1)                                          
dfAll['Title']=dfAll['Title'].replace(to_replace=['Miss.','Mlle.','Ms.','Mrs.','Mme.','Mr.'],value=0)

In [4]:
#Simple imputation, based on this analysis: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic
dfAll['Fare']=dfAll['Fare'].replace(to_replace=[dfTest['Fare'][1044-892]],value=8.05)
#Numpy does not allow direct comparison to np.nan. This is an admittedly ugly workaround.
NAN=(dfTrain['Embarked'][829])
dfAll['Embarked']=dfTrain['Embarked'].replace(to_replace=[NAN],value='C')



In [5]:
#one-hot encoding
dfAll=pandas.get_dummies(dfAll, columns=['Sex'])
dfAll=pandas.get_dummies(dfAll, columns=['Embarked'])


In [6]:
#Normalization
def normalize(df,strlab):
    df[strlab]=(df[strlab].map(lambda x: float(float(float(x) - float(numpy.min(df[strlab]))) /
                                                     float(float(numpy.max(df[strlab])) - float(numpy.min(df[strlab]))))))
normalize(dfAll,'Age')
normalize(dfAll,'Parch')
normalize(dfAll,'Pclass')
normalize(dfAll,'Fare')
normalize(dfAll,'SibSp')


In [7]:
#Prepare to imputate age.
dfNoAge=dfAll[numpy.isnan(dfAll['Age'])]
dfAge=dfAll[(numpy.isnan(dfAll['Age']))==False]
print('Has age:',len(dfAge))
print('Missing age:',len(dfNoAge))

noAgeAE=dfAll.drop(['Age','Cabin','PassengerId','Name','Survived','Ticket'],axis=1).as_matrix()
ageTrainX=dfAge.drop(['Age','Cabin','PassengerId','Name','Survived','Ticket'],axis=1).as_matrix()
ageTrainY=dfAge.as_matrix(columns=['Age'])

Has age: 1046
Missing age: 263


In [8]:
#import Keras 
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils

Using TensorFlow backend.


In [22]:
#Python does not have a Multiple Imputation by Chained Equations library.
#While I could export to R and use MICE I wanted to try neural net regression.
#As my research frequently involves autoencoders, I used an AE to initialize the hidden before running a normal NN

batch_size = 20
nb_classes = 2
nb_epochs = 20
models=[]

#Runs multiple models with increasing epochs, and with the option to pick an earlier one.
#Not optimal, but helps deal with bugs and latency in jupyter notebook. Verbosity is disabled for this reason.
for x in range(0,3):
    nb_epoch=nb_epochs*x
    print(nb_epoch)
    model = Sequential()
    model.add(Dense(9, input_shape=(10,)))
    model.add(Activation('tanh'))
    model.add(Dense(10))
    model.add(Activation('linear'))

    model.compile(loss='mean_absolute_error',
              optimizer=RMSprop(),
              metrics=['accuracy'])

    history = model.fit(noAgeAE, noAgeAE,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=0, validation_data=(noAgeAE, noAgeAE))

    score = model.evaluate(noAgeAE, noAgeAE, verbose=1)
    models.append(model)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('')
print('\nDone')


0




  32/1309 [..............................] - ETA: 6sTest score: 0.523910809101
Test accuracy: 0.0412528647794

20
Test accuracy: 0.815126050192

40
  32/1309 [..............................] - ETA: 0sTest score: 0.0134952006523
Test accuracy: 0.597402596947


Done


In [23]:
AELayer=model #Selects last model

In [24]:
batch_size = 1
nb_classes = 2
nb_epochs = 20
models=[]

#Massive bug here in which each run is not independent, and the initialized layers get backproped over on every run.
#However, this seems to increase performance, and may be worth looking into.

for x in range(0,3):
    nb_epoch=nb_epochs*x
    print(nb_epoch)
    model = Sequential()
    model.add(AELayer.layers[0])
    model.add(AELayer.layers[1])
    model.add(Dense(10))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss='mean_squared_error',
              optimizer=RMSprop(),
              metrics=['accuracy'])

    history = model.fit(ageTrainX, ageTrainY,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=0, validation_data=(ageTrainX, ageTrainY))

    score = model.evaluate(ageTrainX, ageTrainY, verbose=1)
    models.append(model)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('')
print('\nDone')

0




  32/1046 [..............................] - ETA: 5sTest score: 0.492518335809
Test accuracy: 0.000956022944551

20
  32/1046 [..............................] - ETA: 0sTest score: 0.022460946247
Test accuracy: 0.0019120458891

40
  32/1046 [..............................] - ETA: 0sTest score: 0.0212936844128
Test accuracy: 0.0019120458891


Done


In [25]:
finmod=model #pick last model

In [26]:
#imputate age from NN model

dfNoAge=dfNoAge.drop(['Age'],axis=1)
matNoAge=dfNoAge.drop(['Cabin','PassengerId','Name','Survived','Ticket'], axis=1)

ages=finmod.predict(matNoAge.as_matrix(),batch_size=1)

dfNoAge['Age']=ages

In [27]:
#Recombine data, and resplit to build the classifier
dfRecombined=dfNoAge.append(dfAge)
dfRecombined=dfRecombined.drop(['Cabin','Name','Ticket'],axis=1)
MatAEClass=dfRecombined.drop(['Survived','PassengerId'],axis=1).as_matrix()

dfRecTest=dfRecombined[numpy.isnan(dfRecombined['Survived'])]
dfRecTest=dfRecTest.drop(['Survived'],axis=1)
dfRecTrain=dfRecombined[numpy.isnan(dfRecombined['Survived'])==False]


In [28]:
"""
I tried splitting off 91 examples for validation but training accuracy was hit pretty hard,
and validation accuracy was extremely erratic depending on how the random sampling went.
Since nothing is going right I decided to risk deliberately overfitting the test set
and decided to initialize the first layer of a deep neural net by running an AE on all the data.
This might be a viable way to use data with missing labels.
"""

batch_size = 1
nb_classes = 2
nb_epochs = 20
models=[]

for x in range(0,3):
    nb_epoch=nb_epochs*x
    print(nb_epoch)
    model = Sequential()
    model.add(Dense(7, input_shape=(11,)))
    model.add(Activation('relu'))
    model.add(Dense(11))
    model.add(Activation('linear'))

    model.compile(loss='mean_squared_error',
              optimizer=RMSprop(),
              metrics=['accuracy'])

#Verbosity disabled since it triggers a juptyer notebook bug and crashes the training

    history = model.fit(MatAEClass, MatAEClass,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=0, validation_data=(MatAEClass, MatAEClass))

    score = model.evaluate(MatAEClass, MatAEClass, verbose=1)
    models.append(model)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('')
print('\nDone')

0




  32/1309 [..............................] - ETA: 7sTest score: 0.593811725596
Test accuracy: 0.110007639414

20
  32/1309 [..............................] - ETA: 0sTest score: 0.00279010708317
Test accuracy: 0.437738731401

40
Test accuracy: 0.545454545409


Done


In [29]:
AEModel=model #Pick model

In [30]:
#Reseperate training and test data; format labels
finTrainX=dfRecTrain.drop(['Survived','PassengerId'],axis=1).as_matrix()
dfRecTrain=pandas.get_dummies(dfRecTrain, columns=['Survived'])
finTrainY=dfRecTrain.as_matrix(columns=['Survived_0.0','Survived_1.0'])

In [31]:
"""
Again, theres the massive bug where the first hidden layer is shared by, and subsequently trained by, all the models.
However this has given me the best test accuracy so far, and fixing this bug drops accuracy by a few percent at least.
Given that deep nets tend to have vanishing gradients further up this may be a way to counter that, similar to how AE based
deep belief networks are used.

I've messed around a lot with varying layers, hidden units, activations, optimizers, loss functions, dropout 
and regularization, and while the AE initialization helped, getting further with just NN/DNN architectures will
require either luck or experience. Other public submissions further up the leaderboard have used genetic algorithms and
multiple models in a committee. Integrating elements of this approach may be helpful.
"""
batch_size = 1
nb_classes = 2
nb_epochs = 10 
models=[]

for x in range(0,20):
    nb_epoch=nb_epochs*x
    print(nb_epoch)
    model = Sequential()
    model.add(AEModel.layers[0])
    model.add(AEModel.layers[1])
    #model.add(Dense(100, input_shape=(11,)))
    #model.add(Activation('relu'))
    #model.add(Dropout(.2))
    model.add(Dense(10))
    model.add(Activation('relu'))
    #model.add(Dropout(.2))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

#Verbosity disabled since it triggers a juptyer notebook bug and crashes the training

    history = model.fit(finTrainX, finTrainY,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=0, validation_data=(finTrainX, finTrainY))

    score = model.evaluate(finTrainX, finTrainY, verbose=1)
    models.append(model)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('')
print('\nDone')

0




 32/891 [>.............................] - ETA: 5sTest score: 0.672483369223
Test accuracy: 0.616161616295

10
 32/891 [>.............................] - ETA: 0sTest score: 0.413497737562
Test accuracy: 0.826038158435

20
 32/891 [>.............................] - ETA: 0sTest score: 0.396185207447
Test accuracy: 0.840628506827

30
 32/891 [>.............................] - ETA: 0sTest score: 0.395122745038
Test accuracy: 0.839506173508

40
 32/891 [>.............................] - ETA: 0sTest score: 0.38853517265
Test accuracy: 0.838383838585

50
Test accuracy: 0.838383839053

60
Test accuracy: 0.843995510194

70
Test accuracy: 0.849607183609

80
Test accuracy: 0.846240180911

90
Test accuracy: 0.847362514029

100
Test accuracy: 0.847362514698

110
Test accuracy: 0.845117845118

120
Test accuracy: 0.847362514029

130
 32/891 [>.............................] - ETA: 0sTest score: 0.353726279488
Test accuracy: 0.854096522101

140
 32/891 [>.............................] - ETA: 0sTest sco

In [33]:
#Converts things to CSV for Kaggle submission

model=models[-1]

outs = model.predict_classes(dfRecTest.drop(['PassengerId'],axis=1).as_matrix())
print(outs)

output = []
output.append(["PassengerId","Survived"])
matTestId=dfRecTest.as_matrix(columns=['PassengerId'])
for x in range(0,len(outs)):
    output.append([matTestId[x][0],int(outs[x])])

 32/418 [=>............................] - ETA: 0s[0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1
 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0
 1 1 0 1 0 0 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0
 0 0 0 1 1 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0
 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1
 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0
 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 1
 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0
 1 1 0 0 0 0 1 1 1 1 0]


In [64]:
import pandas as pd
output2 = pd.DataFrame(output, columns=['PassengerId','Survived' ])

In [65]:
output2

Unnamed: 0,PassengerId,Survived
0,PassengerId,Survived
1,902,0
2,914,1
3,921,0
4,925,1
5,928,1
6,931,0
7,933,0
8,939,0
9,946,0


In [68]:
output2.to_csv(path_or_buf='outputAENNRAEDNN.csv',index=False,columns=['PassengerId','Survived'])
