In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

# Data Prepare

In [67]:
df_titan = pd.read_csv('titanic_train.csv')
titan_y = df_titan.Survived
titan_x = df_titan.drop(columns=['Survived', 'PassengerId','Ticket','Cabin'])

In [79]:
from sklearn.model_selection import train_test_split  
X_train, X_test, Y_train, Y_test = train_test_split(
    titan_x, titan_y, test_size=0.3, random_state=1) 

In [95]:
def AddFeature(df):
    df['noble'] = df['Name'].str.extract(r'\b(\w+)\.')
    df.loc[~df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 1
    df.loc[df['noble'].isin(['Mr','Mrs','Miss','Miss','Ms']),'noble'] = 0
    df = df.drop(columns=['Name'])
    df['Family'] = df['Parch'] + df['SibSp']
    df.loc[df['Family']>0, 'Family'] = 1
    df.loc[df['Family']==0, 'Family'] = 0
    df = df.drop(columns=['SibSp', 'Parch'])
    return df

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
imp1 = SimpleImputer(strategy='mean')
imp2 = SimpleImputer(strategy='most_frequent')
enc = OneHotEncoder(sparse=False)
scaler = MinMaxScaler()
af = FunctionTransformer(AddFeature)
tf = ColumnTransformer([('impa', imp1, [2,3]),('impb', imp2, [4])], remainder='passthrough')
tf1 = ColumnTransformer([('enca', enc, [2,3,4])], remainder='passthrough')
tf2 = ColumnTransformer([('scalera', scaler, [8,9])], remainder='passthrough')
pca = PCA(n_components=8)

In [80]:
from sklearn.pipeline import Pipeline
steps = [('add', af),
         ('tf', tf),
         ('tf1', tf1),
         ('tf2', tf2), 
         ('pca', pca)]
pipe = Pipeline(steps)

In [81]:
X_train = pipe.fit_transform(X_train)

In [97]:
#X_train[0]
#X_train.astype(float)

# Tensorflow

In [9]:
#set to use tensorflow 2.0 as the backend
#this is designed to work on Colab
%tensorflow_version 2.x

In [83]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [84]:
# define the network model
network = Sequential()
network.add(Dense(128, activation='relu', input_dim=8))
network.add(Dense(256, activation='relu'))
network.add(Dense(1, activation='sigmoid'))

In [85]:
# compile your model
network.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [86]:
network.fit(X_train,Y_train, epochs=50, batch_size=16) #X_train.astype(float) if skipping scaling

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff20518c510>

## Making Pipeline

In [87]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [88]:
def create_model():
    network = Sequential()
    network.add(Dense(128, activation='relu', input_dim=8))
    network.add(Dense(256, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy',
                optimizer='adam',#sgd
                metrics=['accuracy'])
    return network
nn = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=0)

  # Remove the CWD from sys.path while we load stuff.


In [54]:
# if skip scaling
def toFloat(df):
    return df.astype(float)
tF = FunctionTransformer(toFloat)

In [89]:
from sklearn.pipeline import Pipeline
steps = [('add', af),
         ('tf', tf),
         ('tf1', tf1),
         ('tf2', tf2),
         ('pca', pca),
         ('nn', nn)]
pipe = Pipeline(steps)

In [90]:
pipe.fit(titan_x, titan_y)

Pipeline(steps=[('add',
                 FunctionTransformer(func=<function AddFeature at 0x7ff205b4ec20>)),
                ('tf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impa', SimpleImputer(),
                                                  [2, 3]),
                                                 ('impb',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [4])])),
                ('tf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('enca',
                                                  OneHotEncoder(sparse=False),
                                                  [2, 3, 4])])),
                ('tf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scalera', MinMaxScaler(),
                   

In [91]:
pipe.score(X_test, Y_test)

0.8097015023231506

In [92]:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold()
cross_val_score(pipe, titan_x, titan_y, cv=kf )

array([0.79888266, 0.82584268, 0.81460673, 0.78651685, 0.85955054])

In [93]:
titan_test = pd.read_csv('titanic_test.csv')
pred = pipe.predict(titan_test)

In [94]:
np.savetxt("test.csv", pred, delimiter=",")