In [9]:
# conda install scikit-learn

import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

In [10]:
titanic_dataset_dir = '../titanic_dataset/'
for dirname, _, filenames in os.walk(titanic_dataset_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../titanic_dataset/test.csv
../titanic_dataset/train.csv
../titanic_dataset/gender_submission.csv


In [11]:
from sklearn.model_selection import train_test_split

train_data = pd.read_csv(titanic_dataset_dir + "/train.csv")
train_data.head()

train_data, validation_data = train_test_split(train_data, test_size=0.1)

test_data = pd.read_csv(titanic_dataset_dir + "/test.csv")
test_data.head()
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
601,602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S
360,361,0,3,"Skoog, Mr. Wilhelm",male,40.0,1,4,347088,27.9,,S
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0,,S
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
284,285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S


In [50]:

train_data['Sex'].replace([0,1],['female','male'],inplace=True)
train_data['Age'] = train_data['Age'].fillna(30)


x = np.mean(train_data['Age'],axis=0)
x

train_data['Embarked'].unique()


array(['S', 'C', 'Q', nan], dtype=object)

In [34]:
features = ["Pclass", "Sex", "SibSp", "Parch","Fare", "Age"]

featurized_dataset = pd.get_dummies(train_data[features])
featurized_dataset = np.asarray(featurized_dataset)
print(featurized_dataset)
labels = np.asarray(train_data["Survived"])

validation_featurized_dataset = pd.get_dummies(validation_data[features])
validation_featurized_dataset = np.asarray(validation_featurized_dataset)
validation_labels = np.asarray(validation_data["Survived"])


test_featurized_dataset = pd.get_dummies(test_data[features])
test_featurized_dataset = np.asarray(test_featurized_dataset)


[[ 3.   0.   0.  ... 30.   0.   1. ]
 [ 3.   1.   4.  ... 40.   0.   1. ]
 [ 2.   0.   0.  ... 42.   1.   0. ]
 ...
 [ 2.   0.   0.  ... 32.5  1.   0. ]
 [ 3.   0.   0.  ... 51.   0.   1. ]
 [ 3.   0.   0.  ... 25.   0.   1. ]]


In [68]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(1280, activation='sigmoid'),
  tf.keras.layers.Dense(128, activation='sigmoid'),
  tf.keras.layers.Dense(64, activation='sigmoid'),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(1)
])

In [69]:
model.compile(loss = tf.keras.losses.MeanSquaredError(),
                      optimizer = tf.keras.optimizers.Adam())

In [70]:
model.fit(featurized_dataset, labels, epochs=100)

Train on 801 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


<tensorflow.python.keras.callbacks.History at 0x7fc6e8de2a50>

In [71]:
predictions = model.predict(validation_featurized_dataset)
output = pd.DataFrame({'PassengerId': validation_data.PassengerId.tolist(), 'Survived': predictions.tolist()})
predictions_zero_one = np.where(predictions > 0.5, 1, 0)
output = pd.DataFrame({'Label': validation_data.Survived.tolist(), 'Prediction': predictions_zero_one.reshape(-1).tolist()})
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(output['Prediction'], output['Label'])

acc = np.sum(conf_mat.diagonal()) / np.sum(conf_mat)
acc

0.7222222222222222

In [72]:
test_predictions = model.predict(test_featurized_dataset)
test_predictions_zero_one = np.where(test_predictions > 0.5, 1, 0)
output = pd.DataFrame({'PassengerId': test_data.PassengerId.tolist(), 'Survived': test_predictions_zero_one.reshape(-1).tolist()})
print(output)

output.to_csv('submission_tf.csv', index=False)
print("Your submission was successfully saved!")

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
Your submission was successfully saved!
