# Titanic: Machine Learning from Disaster

https://www.kaggle.com/c/titanic

In [23]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas.tools.plotting import scatter_matrix
%matplotlib inline

In [24]:
def load_data(fname):
    '''Loads data and cleans data (testing and training)'''
    
    scaler = StandardScaler()
    df = pd.read_csv(fname)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Cabin'] = df['Cabin'].fillna(0)
    df['Cabin'] = (df['Cabin'] != 0).astype(int)
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    emb_cat = keras.utils.to_categorical(df['Embarked'].as_matrix())
    df['EmbS'] = emb_cat[:, 0]
    df['EmbC'] = emb_cat[:, 1]
    df['EmbQ'] = emb_cat[:, 2]
    
    df['Pclass'] = df['Pclass'].map({1: 0, 2: 1, 3: 2})
    pcl_cat = keras.utils.to_categorical(df['Pclass'].as_matrix())
    df['Plc0'] = pcl_cat[:, 0]
    df['Plc1'] = pcl_cat[:, 1]
    df['Plc2'] = pcl_cat[:, 2]
    
    mean_age = df['Age'].dropna().mean()
    std_age = df['Age'].dropna().std()
    df['Age'] = np.random.randint(mean_age - std_age, mean_age + std_age, size=len(df['Age']))
    df['Age'] = scaler.fit_transform(df['Age'].reshape(-1, 1))
    df['Fare'] = df['Fare'].fillna(df['Fare'].std())
    df['Fare'] = scaler.fit_transform(df['Fare'].reshape(-1, 1))
    df['SibSp'] = scaler.fit_transform(df['SibSp'].reshape(-1, 1))
    df['Parch'] = scaler.fit_transform(df['Parch'].reshape(-1, 1))

    del df['Pclass']
    del df['Embarked']
    del df['Name']
    del df['Ticket']
    return df

In [25]:
def save_data(fname, data):
    '''Saves data to file'''
    
    with open(fname, 'w') as f:
        f.write('PassengerId,Survived\n')
        for pid, result in data:
            f.write('{0},{1}\n'.format(int(pid), int(result)))

In [26]:
data = load_data('train.csv')
data.head()



Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,EmbS,EmbC,EmbQ,Plc0,Plc1,Plc2
0,1,0,0,0.319565,0.432793,-0.473674,-0.502445,0,1.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,-0.0404,0.432793,-0.473674,0.786845,1,0.0,1.0,0.0,1.0,0.0,0.0
2,3,1,1,-0.520354,-0.474545,-0.473674,-0.488854,0,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,-1.120296,0.432793,-0.473674,0.42073,1,1.0,0.0,0.0,1.0,0.0,0.0
4,5,0,0,-0.280377,-0.474545,-0.473674,-0.486337,0,1.0,0.0,0.0,0.0,0.0,1.0


In [27]:
pdata = load_data('test.csv')
pdata.head()



Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Cabin,EmbS,EmbC,EmbQ,Plc0,Plc1,Plc2
0,892,0,1.659587,-0.49947,-0.400248,-0.499198,0,0.0,0.0,1.0,0.0,0.0,1.0
1,893,1,0.265073,0.616992,-0.400248,-0.514063,0,1.0,0.0,0.0,0.0,0.0,1.0
2,894,0,-1.256215,-0.49947,-0.400248,-0.465885,0,0.0,0.0,1.0,0.0,1.0,0.0
3,895,0,1.279265,-0.49947,-0.400248,-0.48426,0,1.0,0.0,0.0,0.0,0.0,1.0
4,896,1,-0.242023,0.616992,0.619896,-0.419275,0,1.0,0.0,0.0,0.0,0.0,1.0


In [28]:
X = data.as_matrix()[:, 2:]
y = data.as_matrix()[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [29]:
def create_model():
    '''Creates model'''
    
    model = Sequential()
    model.add(Dense(12, input_dim=12, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [30]:
model = create_model()
model.fit(X_train, y_train, epochs=100)
score = model.evaluate(X_test, y_test, batch_size=25)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [31]:
print(score)

[0.49883954201996661, 0.77653630951929353]


In [32]:
prediction = model.predict(pdata.as_matrix()[:, 1:])
prediction = (prediction > 0.5).astype(int)
prediction = prediction.reshape(prediction.shape[0])

In [33]:
result = zip(pdata.as_matrix()[:, 0], prediction)
save_data('result.csv', result)