In [9]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
import re
import copy
from tensorflow import keras
# plotly.offline renders plotly figures locally inside the notebook
import plotly.offline as py 
py.init_notebook_mode(connected=True) # this code, allow us to work with offline plotly version
import plotly.graph_objs as go # it's like "plt" of matplot
import plotly.tools as tls # It's useful to we get some tools of plotly
import warnings # This library will be used to ignore some warnings
from collections import Counter # To do counter of some features
# Test-set accuracies collected by the evaluation cells below and read by
# position in the comparison bar chart: index 0 = linear estimator,
# index 1 = DNN estimator, index 2 = Keras model.
accuracy_list = []

In [16]:
# 1. data preprocessing
AGE_FILL = 28  # value used for missing ages (presumably close to the median age -- TODO confirm)
UNKNOWN_LABEL = "Unknown"  # placeholder category for missing Cabin / Embarked values


def _cabin_deck(cabin):
    """Reduce a cabin code to its first word character; keep the "Unknown" placeholder as is."""
    if cabin == UNKNOWN_LABEL:
        return UNKNOWN_LABEL
    # Raw string: "\w" inside a normal string literal is an invalid escape sequence.
    return re.findall(r"^\w", cabin)[0]


def _clean_passengers(raw_df, fare_fill):
    """Return a cleaned copy of a passenger table; the same steps serve train and test.

    Args:
        raw_df: frame holding at least the Name, Ticket, Cabin, Embarked, Age,
            Fare and SibSp columns.
        fare_fill: value substituted for missing Fare entries.

    Returns:
        A new frame without Name/Ticket, with missing values filled, Cabin
        reduced to its leading character and an extra "alone" column.
    """
    # The name of passenger and Ticket are not used as model features.
    cleaned = raw_df.drop(columns=["Name", "Ticket"])
    cleaned["Cabin"] = cleaned["Cabin"].fillna(UNKNOWN_LABEL).apply(_cabin_deck)
    cleaned["Embarked"] = cleaned["Embarked"].fillna(UNKNOWN_LABEL)
    cleaned["Age"] = cleaned["Age"].fillna(AGE_FILL)
    # A NaN fare would propagate to NaN model outputs, which the later
    # thresholding step cannot tell apart from a real prediction.
    cleaned["Fare"] = cleaned["Fare"].fillna(fare_fill)
    # NOTE(review): the flag is "Y" when SibSp > 0, i.e. when the passenger is
    # NOT travelling without siblings/spouse -- the name reads inverted.
    # Encoding kept unchanged; confirm the intended meaning before flipping.
    cleaned["alone"] = cleaned["SibSp"].apply(lambda x: "Y" if x > 0 else "N")
    return cleaned


train_df = pd.read_csv("./kaggle_train.csv", index_col=0)
test_df = pd.read_csv("./kaggle_test.csv")

# The fill value is computed on the training data only and reused for the test data.
train_fare_median = train_df["Fare"].median()
train_df = _clean_passengers(train_df, train_fare_median)
test_df = _clean_passengers(test_df, train_fare_median)

# Snapshot that still contains the "Survived" column (taken before the pop below).
new_train_df = copy.deepcopy(train_df)
y_train = train_df.pop('Survived')

<h1 align="center"> Data format </h1>

In [None]:
from sklearn.model_selection import train_test_split

# Labels are read from `new_train_df` (the copy taken before "Survived" was
# popped) rather than from `y_train`: this cell rebinds `y_train` to the
# smaller training subset below, so reading `y_train` here would fail with a
# length mismatch as soon as the cell is executed a second time.
labels_all = new_train_df["Survived"]

# First split: hold out a test set; second split: carve a validation set out
# of the remaining training data.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    train_df, labels_all, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)


In [None]:
categorical_columns = ["Sex", "SibSp", "Parch", "Pclass", "Cabin", "Embarked", "alone"]
numeric_columns = ["Age", "Fare"]

# Categorical features: one indicator (one-hot) column per feature, with the
# vocabulary taken from the distinct values seen in train_df.
feature_columns = []
for column_name in categorical_columns:
    vocab = train_df[column_name].unique()
    print(column_name, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features are passed through as float32 columns.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns)

In [None]:
def make_dataset(data_df, label_df, epochs = 10, 
                 shuffle = True, batch_size = 32):
    """Build a batched tf.data.Dataset of (feature dict, label) pairs.

    Args:
        data_df: table of features; it is converted with dict(), so each key
            becomes one named feature (called with pandas DataFrames in this
            notebook).
        label_df: labels aligned with the rows of data_df.
        epochs: number of times the data is repeated.
        shuffle: when true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: number of elements per batch.

    Returns:
        The (optionally shuffled) dataset, repeated `epochs` times and then
        batched.
    """
    features_by_name = dict(data_df)
    pairs = tf.data.Dataset.from_tensor_slices((features_by_name, label_df))
    ordered = pairs.shuffle(10000) if shuffle else pairs
    repeated = ordered.repeat(epochs)
    return repeated.batch(batch_size)

<h1 align="center"> <font color="black" align="center">Estimator Method</font> </h1>

In [None]:
linear_out_dir = "./Titanic_linear_model"
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, so no separate existence check is needed.
# NOTE(review): model_dir persists between runs; presumably the estimator
# resumes from checkpoints left there by an earlier run -- confirm.
os.makedirs(linear_out_dir, exist_ok=True)
linear_estimator = tf.compat.v1.estimator.LinearClassifier(
    model_dir = linear_out_dir,
    n_classes=2,
    feature_columns=feature_columns)
# Train on the full training portion (train + validation rows), 100 passes over the data.
linear_estimator.train(
    input_fn = lambda : make_dataset(x_train_all, y_train_all, epochs=100))

In [None]:
# Evaluate on the held-out test split: a single unshuffled pass.
history = linear_estimator.evaluate(
    input_fn = lambda : make_dataset(x_test, y_test, epochs=1, shuffle = False))
# Slice assignment instead of append: on an empty list this adds the value as
# element 0, and on a re-run it overwrites element 0 rather than growing the
# list, so the positions read by the comparison chart stay valid.
accuracy_list[0:1] = [history["accuracy"]]

In [None]:
dnn_out_dir = "./Tatanic_dnn_model"
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, so no separate existence check is needed.
# NOTE(review): model_dir persists between runs; presumably the estimator
# resumes from checkpoints left there by an earlier run -- confirm.
os.makedirs(dnn_out_dir, exist_ok=True)
dnn_estimator = tf.compat.v1.estimator.DNNClassifier(
    model_dir=dnn_out_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[100, 100],  # two hidden layers of 100 units each
    activation_fn=tf.nn.relu,
    optimizer="Adam")
# Train on the full training portion (train + validation rows), 100 passes over the data.
dnn_estimator.train(input_fn = lambda : make_dataset(
                    x_train_all, y_train_all, epochs=100))

In [None]:
# Evaluate on the held-out test split: a single unshuffled pass.
history = dnn_estimator.evaluate(input_fn= lambda : make_dataset(
                       x_test, y_test, epochs=1, shuffle = False))
# Slice assignment instead of append: right after the linear evaluation (list
# of length 1) this adds the value as element 1, and on a re-run it overwrites
# element 1 rather than growing the list, so the positions read by the
# comparison chart stay valid.
accuracy_list[1:2] = [history["accuracy"]]

<h1 align="center"> Model Method </h1>

In [None]:
def create_model():
    """Build and compile the Keras classifier used in the "Model Method" section.

    The network is a DenseFeatures layer over the notebook-level
    `feature_columns`, followed by two 100-unit ReLU layers and a 2-unit
    softmax output. It is compiled with sparse categorical cross-entropy,
    SGD with a learning rate of 0.01, and the accuracy metric.

    Returns:
        The compiled `keras.models.Sequential` model.
    """
    model = keras.models.Sequential([
        keras.layers.DenseFeatures(feature_columns),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dense(2, activation="softmax"),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  # `learning_rate` is the supported argument name; `lr` is
                  # only a deprecated alias for it.
                  optimizer = keras.optimizers.SGD(learning_rate=0.01),
                  metrics = ["accuracy"])

    return model
model = create_model()

In [None]:
train_dataset = make_dataset(x_train, y_train, epochs=100)
valid_dataset = make_dataset(x_valid, y_valid, epochs=100)
logdir = './Tatanic_model'
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, so no separate existence check is needed.
os.makedirs(logdir, exist_ok=True)
# NOTE(review): output_model_file is computed but not used anywhere in this
# cell, so nothing is written to it here -- confirm whether a checkpoint
# callback or a model.save() call was intended.
output_model_file = os.path.join(logdir, "train_model.h5")
# The datasets already repeat for 100 epochs, so the number of batches drawn
# per Keras epoch is fixed explicitly through the *_steps arguments.
model.fit(train_dataset, validation_data=valid_dataset, 
                    steps_per_epoch=15, validation_steps=8, 
                    epochs = 100)

In [None]:
# Evaluate on the held-out test split: a single unshuffled pass. The result is
# indexed with [1] below to pick the accuracy (the model was compiled with the
# loss plus one "accuracy" metric).
history = model.evaluate(make_dataset(x_test, y_test, epochs=1, shuffle = False))
# Slice assignment instead of append: after the two estimator evaluations
# (list of length 2) this adds the value as element 2, and on a re-run it
# overwrites element 2 rather than growing the list, so the positions read by
# the comparison chart stay valid.
accuracy_list[2:3] = [history[1]]

In [None]:
# One bar per model: (x-axis label, accuracy, legend name). The accuracies are
# read from accuracy_list by position.
bar_specs = [
    ('Linear', accuracy_list[0], 'LinearEstimator'),
    ('DNN', accuracy_list[1], 'DNNEstimator'),
    ('Model', accuracy_list[2], 'Model'),
]

data = [
    go.Bar(
        x=np.array([axis_label], dtype=object),
        y=np.array([accuracy], dtype=object),
        name=legend_name,
    )
    for axis_label, accuracy, legend_name in bar_specs
]
trace0, trace1, trace2 = data

layout = go.Layout(
    title='Accuracy Comparison Among Various Models',
    xaxis=dict(title='Various models'),
    yaxis=dict(title='accuracy'),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='grouped-bar')

In [None]:
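## Based on the comparison above, we would use the LinearEstimator as the main model to predict Titanic survival
## TODO: the prediction cell below calls `model.predict`, i.e. the Keras model, not the LinearEstimator -- reconcile.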

In [None]:
features = ['Age', 'Cabin', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'alone']
# One test_df column per feature name, in the order listed above.
# NOTE(review): the columns are handed to the model positionally as a list;
# presumably this order has to match the input order the model expects -- confirm.
feature_list = [test_df[feature_name] for feature_name in features]
predictor = model.predict(feature_list)

In [None]:
def get_data(data):
    """Turn rows of two scores into 0/1 labels.

    Args:
        data: iterable of rows; each row is indexable with a score at
            position 0 and a score at position 1.

    Returns:
        A list with 0 for every row whose first score is strictly greater
        than its second score, and 1 for every other row (ties included).
    """
    return [0 if scores[0] > scores[1] else 1 for scores in data]
# Store the 0/1 labels derived from `predictor` as a "Survived" column on test_df.
test_df["Survived"] = get_data(predictor)

In [None]:
# Count each predicted label once. value_counts() omits labels that never
# occur, so .get(label, 0) is used instead of [label]: the chart still renders
# (with a zero-height bar) when the model predicts only one class, where
# indexing would raise KeyError.
survival_counts = test_df["Survived"].value_counts()

trace0 = go.Bar(
            x = np.array(['Dead'], dtype=object),
            y = np.array([survival_counts.get(0, 0)], dtype=object),
            name='Dead'
    )

trace1 = go.Bar(
            x = np.array(['Survive'], dtype=object),
            y = np.array([survival_counts.get(1, 0)], dtype=object),
            name='Survive'
    )

data = [trace0, trace1]


layout = go.Layout(
    yaxis=dict(
        title='Number'
    ),
    xaxis=dict(
        title='Various situation'
    ),
    title='Prediction for Tiatanic Survival'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='grouped-bar')

<h1 align="center"> Survival predictions used for the Kaggle submission</h1>

In [None]:
# Keep only the passenger id and the predicted label for the submission.
submission_columns = ["PassengerId", "Survived"]
SubmissionObj = test_df.loc[:, submission_columns]

In [None]:
# Write the submission table to Submission_list.csv without the index column.
SubmissionObj.to_csv("Submission_list.csv", index=False)