In [1]:
#Liangco, Ma. Karizza F.
#COE005 - Prelims

#Importing Needed Libraries
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

#Importing Libraries for Machine Learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Importing Gathered Dataset from Kaggle
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
combine = [train, test]

In [3]:
#Display of Initial Dataset
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Preparing the Data and Dropping other Features

#Checking the Shape of the Dataset
print("Before", train.shape, test.shape, combine[0].shape, combine[1].shape)

#Dropping Irrelevant Columns
train = train.drop(['Ticket', 'Cabin'], axis=1)
test = test.drop(['Ticket', 'Cabin'], axis=1)
combine = [train, test]

#Checking the Shape of the Dataset
"After", train.shape, test.shape, combine[0].shape, combine[1].shape

Before (891, 12) (418, 11) (891, 12) (418, 11)


('After', (891, 10), (418, 9), (891, 10), (418, 9))

In [5]:
#Retaining the new Title feature for Model Training later
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [6]:
#Further preparation of Data
#Classification of Titles as Common or Rare

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')


In [7]:
#Further preparation of Data
#Converting the Titles in Ordinal Form 

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [8]:
#Dropping the Name and PassengerId

train = train.drop(['Name', 'PassengerId'], axis=1)
test = test.drop(['Name'], axis=1)
combine = [train, test]
train.shape, test.shape

((891, 9), (418, 9))

In [9]:
#Converting to a New Feature
#Female as 1 and Male as 0

for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [10]:
#Preparing an Empty Array

guess_ages = np.zeros((2,3))

In [11]:
#Iteration for Sex and Pclass
#to calculate guessed values for Age and the combinations

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

In [12]:
train['AgeBand'] = pd.cut(train['Age'], 5)

In [13]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']

In [14]:
train = train.drop(['AgeBand'], axis=1)
combine = [train, test]

In [15]:
#Creation of New Features "Family Size"
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [16]:
#Creation of New Features "A person is Alone"

for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

In [17]:
#Further Dropping other Features in favor of IsALone 

train = train.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test = test.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train, test]

In [18]:
#Combining Pclass and Age

for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

In [19]:
#Seeing the most common occurance on Embark

freq_port = train.Embarked.dropna().mode()[0]

In [20]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [21]:
#Converting into a Numeric Form for the Model

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,7.25,0,1,0,3
1,1,1,1,2,71.2833,1,3,0,2
2,1,3,1,1,7.925,0,2,1,3
3,1,1,1,2,53.1,0,3,0,2
4,0,3,0,2,8.05,0,1,1,6


In [22]:
test['Fare'].fillna(test['Fare'].dropna().median(), inplace=True)

In [23]:
train['FareBand'] = pd.qcut(train['Fare'], 4)

In [24]:
#Converting the Fare Feature into Ordinal Values

for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train= train.drop(['FareBand'], axis=1)
combine = [train, test]
    
train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,0,0,1,0,3
1,1,1,1,2,3,1,3,0,2
2,1,3,1,1,1,0,2,1,3
3,1,1,1,2,3,0,3,0,2
4,0,3,0,2,1,0,1,1,6
5,0,3,0,1,1,2,1,1,3
6,0,1,0,3,3,0,1,1,3
7,0,3,0,0,2,0,4,0,0
8,1,3,1,1,1,0,3,0,3
9,1,2,1,0,2,1,3,0,0


In [25]:
#I Chose KNN and Decision Tree for Machine Learning Algorithm
#Putting "Survived" in Y_train while dropping in X_train

X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test  = test.drop("PassengerId", axis=1).copy()

#Displaying the Shape

X_train.shape, Y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [26]:
#K-Nearest Neighbor Machine Learning Algorithm
#Training the K-Nearest Machine Learning Algorithm

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)

#Evaluation of the K-Nearest
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

83.84

In [27]:
# Decision Tree Machine Learning Algorithm
#Training the Decision Tree Machine Learning Algorithm

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)

#Evaluation of the Decision Tree
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

86.76

In [28]:
#Importing Sklearn for Simple Neural Network
#Importing other Needed Libraries

from sklearn.preprocessing import StandardScaler

#Scaling X_train for Simple Neural Network
#To fit the Model

scale = StandardScaler()
X_train = scale.fit_transform(X_train)

In [29]:
#Creation of Model using Keras Sequential for Simple Neural Network

from keras.models import Sequential
from keras.layers import Dense

def create_model(optimizer='adam', init='uniform'):
#Creation model and Layers

    model = Sequential()
    model.add(Dense(16, input_dim=X_train.shape[1], kernel_initializer=init, activation='relu'))
    model.add(Dense(8, kernel_initializer=init, activation='relu'))
    model.add(Dense(4, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

#Compiling the Model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


In [30]:
# Creation of Classifier
from keras.wrappers.scikit_learn import KerasClassifier

In [31]:
model_pred = KerasClassifier(build_fn=create_model, optimizer='adam', init='glorot_uniform', epochs=50, batch_size=10, verbose=0)
model_pred.fit(X_train, Y_train)

<keras.callbacks.History at 0x2304465a160>

In [32]:
#Scaling
X_test = scale.transform(X_test)

In [33]:
#Prediction of those who survived the Titanic Wreakage

prediction = model_pred.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(Y_pred, prediction))

from sklearn.metrics import accuracy_score
acc_snn=(accuracy_score(Y_pred, prediction)*100)
print(accuracy_score(Y_pred, prediction)*100)

#Displaying the Accuracy Score for the Machine Learning Algo.
#And the Simple Neural Network

models = pd.DataFrame({
    'Model': ['K-Nearest', 
              'Decision Tree','Neural Network'],
    'Accuracy Score': [acc_knn, 
              acc_decision_tree, acc_snn]})
models.sort_values(by='Accuracy Score', ascending=False)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94       271
           1       0.87      0.89      0.88       147

    accuracy                           0.92       418
   macro avg       0.91      0.91      0.91       418
weighted avg       0.92      0.92      0.92       418

91.6267942583732


Unnamed: 0,Model,Accuracy Score
2,Neural Network,91.626794
1,Decision Tree,86.76
0,K-Nearest,83.84


In [34]:
#Expectedly, the Neural Network has the highest Accuracy Score
#Among the three Models

#Although both Decision Tree and KNN are non-parametric methods
#Decision Tree is Faster and have a higher Accuracy Score
#This may be due to KNN's expensive real time execution or
#Not having a auto feature interaction.

#Still, the Neural Network outperforms both Machine Learning Algos.
#As long as there is sufficient amount of data