# Competition Details
#### Platform: Kaggle
#### Competition Title: Titanic - Machine Learning from Disaster
#### Link: https://www.kaggle.com/c/titanic/overview
#### Created by Jonas Ng Jing Xun
##### LinkedIn: https://sg.linkedin.com/in/jonasnjx
##### Github: https://github.com/jonasnjx
###### Thanks for viewing!

In [1]:
import os
os.getcwd()

'C:\\Users\\jing.xun.ng'

## Import Train / Validation data

In [2]:
import pandas as pd
import numpy as np

In [3]:
'''
IMPORT TRAIN DATA
'''
data = pd.read_csv('kaggle_titanic/train.csv')

print('Length of train data: ', len(data))
data.head(5)

Length of train data:  891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
'''
IMPORT VALIDATION DATA - For Kaggle Submission
'''
val_data = pd.read_csv('kaggle_titanic/test.csv')

print('Length of validation data: ', len(val_data))

Length of validation data:  418


## Data Exploration

In [5]:
'''
DATA EXPLORATION
'''

data_exploration = data.copy()

# distribution of passengers who survived / did not survive
# 0 = did not survive, 1 = survived
count_not_survived = data_exploration['Survived'].value_counts()[0]
count_survived = data_exploration['Survived'].value_counts()[1]

# distribution of passengers by gender
count_male = data_exploration['Sex'].value_counts()[0]
count_female = data_exploration['Sex'].value_counts()[1]

# distribution of passengers by survived AND gender
df_survived_gender = data_exploration.groupby(['Survived', 'Sex']).size().reset_index(name = 'Count')
count_male_survived = df_survived_gender[(df_survived_gender['Survived'] == 1) & (df_survived_gender['Sex'] == 'male')]['Count'].iloc[0]
count_male_not_survived = df_survived_gender[(df_survived_gender['Survived'] == 0) & (df_survived_gender['Sex'] == 'male')]['Count'].iloc[0]
count_female_survived = df_survived_gender[(df_survived_gender['Survived'] == 1) & (df_survived_gender['Sex'] == 'female')]['Count'].iloc[0]
count_female_not_survived = df_survived_gender[(df_survived_gender['Survived'] == 0) & (df_survived_gender['Sex'] == 'female')]['Count'].iloc[0]

# distribution of passengers by survived AND age
df_age_survived = pd.crosstab(pd.cut(data_exploration['Age'], bins = 10), data_exploration['Survived'])

# distribution of passengers by survived AND fare
df_fare_survived = pd.crosstab(pd.cut(data_exploration['Fare'], bins = 10), data_exploration['Survived'])

# distribution of passengers by survived AND Pclass (socio-economic class)
df_pclass_survived = data_exploration.groupby(['Pclass', 'Survived']).size().reset_index(name = 'Count')

# distribution of passengers by survived AND Parch (# of parents/children onboard)
df_parch_survived = data_exploration.groupby(['Parch', 'Survived']).size().reset_index(name = 'Count')

# distribution of passengers by survived AND SibSp (# of siblings spouse on board)
df_sibsp_survived = data_exploration.groupby(['SibSp', 'Survived']).size().reset_index(name = 'Count')

print(
'''

STATISTICS:

Number of rows in train data: {}

Number of NA in all columns:
{}


(1) BY GENDER

<1>
Number of passengers who survived: {}
Number of passengers who did not survive: {}

<2>
Number of male passengers: {}
Number of female passengers: {}

<3>
Number of male passengers who survived: {}
Number of male passengers who did not survive: {}
Number of female passengers who survived: {}
Number of female passengers who did not survived: {}


(2) BY AGE

{}


(3) BY PASSENGER FARE

{}


(4) BY PCLASS

{}


(5) BY # OF PARENTS/CHILDREN

{}


(5) BY # OF SIBLINGS / SPOUSES

{}


'''.format(len(data_exploration), data_exploration.isna().sum(),
           count_survived, count_not_survived, 
           count_male, count_female, 
           count_male_survived, count_male_not_survived,
           count_female_survived, count_female_not_survived, 
           df_age_survived,
           df_fare_survived,
           df_pclass_survived,
           df_parch_survived,
           df_sibsp_survived))



STATISTICS:

Number of rows in train data: 891

Number of NA in all columns:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


(1) BY GENDER

<1>
Number of passengers who survived: 342
Number of passengers who did not survive: 549

<2>
Number of male passengers: 577
Number of female passengers: 314

<3>
Number of male passengers who survived: 109
Number of male passengers who did not survive: 468
Number of female passengers who survived: 233
Number of female passengers who did not survived: 81


(2) BY AGE

Survived            0   1
Age                      
(0.34, 8.378]      18  36
(8.378, 16.336]    27  19
(16.336, 24.294]  114  63
(24.294, 32.252]  104  65
(32.252, 40.21]    66  52
(40.21, 48.168]    46  24
(48.168, 56.126]   24  21
(56.126, 64.084]   15   9
(64.084, 72.042]    9   0
(72.04

## Feature Engineering (Create new columns, merge columns etc)

In [9]:
'''
FEATURE ENGINEERING
'''

def feature_engineering(df):
    # drop passengerid, Name, ticket columns (identifiers, too many unique values, low relevance)
    df = df.drop(['PassengerId', 'Name', 'Ticket'], axis = 1)

    # change pclass to categorical
    df['Pclass'].replace({1: "HighSES", 2: "MidSES", 3: "LowSES"}, inplace = True)    
    
    # convert age to categorical type using classification 
    # ref: https://www.statcan.gc.ca/en/concepts/definitions/age2
    # children (1), youth (2), adults (3), seniors (4), nans (0)
    df['Age'] = pd.to_numeric(pd.cut(df['Age'], bins = [0, 14, 24, 64, 120], labels = [1, 2, 3, 4]))
    df['Age'] = df['Age'].fillna(0)
    df['Age'].replace({1: "Children", 2: "Youth", 3: "Adults", 4: "Seniors", 0: "NoAge"}, inplace = True)

    # convert sibsp, parch to binary (whether siblings/spouse, parent/children is onboard (1) OR not (0))
    df.loc[df['SibSp'] == 0, 'SibSp'] = 0
    df.loc[df['SibSp'] >= 1, 'SibSp'] = 1
    df.loc[df['Parch'] == 0, 'Parch'] = 0
    df.loc[df['Parch'] >= 1, 'Parch'] = 1
    df['SibSp'].replace({0: "NoSibSp", 1: "YesSibSp"}, inplace = True)
    df['Parch'].replace({0: "NoParch", 1: "YesParch"}, inplace = True)

    # convert sex column to binary (male (1), female (0))
    df.loc[df['Sex'] == 'male', 'Sex'] = 'Male'
    df.loc[df['Sex'] == 'female', 'Sex'] = 'Female'
    
    # convert passenger fare to first (>= $30), second (>= $13, <= 29), third (<= $12) class
    # ref: https://www.bbc.co.uk/bitesize/topics/z8mpfg8/articles/zng8jty
    df['Fare'] = df['Fare'].fillna(0)
    df.loc[df['Fare'] < 13, 'Fare'] = 3
    df.loc[(df['Fare'] >= 13) & (df['Fare'] < 30), 'Fare'] = 2
    df.loc[df['Fare'] >= 30, 'Fare'] = 1
    df['Fare'].replace({1: "FirstClsFare", 2: "SecondClsFare", 3: "ThirdClsFare", 0: "NoClsFare"}, inplace = True)

    # remove Fare column since there is multicollinearity issue between Fare and Pclass columns
    df = df.drop(['Fare'], axis = 1)
    
    # convert embarked to numerical (S = 1, C = 2, Q = 3, nan = 0)
#    df.loc[df['Embarked'] == 'S', 'Embarked'] = 1
#    df.loc[df['Embarked'] == 'C', 'Embarked'] = 2
#    df.loc[df['Embarked'] == 'Q', 'Embarked'] = 3

    # fill nans with most common embarked area (S)
    df['Embarked'] = df['Embarked'].fillna('S')
    
    # performance drops with column
#    df = df.drop(['Embarked'], axis = 1)

    # convert cabin to binary (value = 1, null = 0)
    df['Cabin'] = np.where(df['Cabin'].isnull(), 0, 1)
    
    # performance drops with column
    df = df.drop(['Cabin'], axis = 1)
    
    # combine SibSp/Parch columns
#    df['SibSp_Parch'] = df['SibSp'] + df['Parch']
    
    return df

train_data_feature_eng = data.copy()
train_data_feature_eng = feature_engineering(train_data_feature_eng)

print('Number of rows: ', len(train_data_feature_eng))
train_data_feature_eng.head(20)

Number of rows:  891


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,LowSES,Male,Youth,YesSibSp,NoParch,S
1,1,HighSES,Female,Adults,YesSibSp,NoParch,C
2,1,LowSES,Female,Adults,NoSibSp,NoParch,S
3,1,HighSES,Female,Adults,YesSibSp,NoParch,S
4,0,LowSES,Male,Adults,NoSibSp,NoParch,S
5,0,LowSES,Male,NoAge,NoSibSp,NoParch,Q
6,0,HighSES,Male,Adults,NoSibSp,NoParch,S
7,0,LowSES,Male,Children,YesSibSp,YesParch,S
8,1,LowSES,Female,Adults,NoSibSp,YesParch,S
9,1,MidSES,Female,Children,YesSibSp,NoParch,C


## Testing out different algorithms on dataset

In [10]:
'''
MODEL DEVELOPMENT
'''
import time

from numpy import mean
from numpy import std

from sklearn.model_selection import train_test_split
from sklearn import metrics

# import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

# ensemble method
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# CV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

train_data_model = train_data_feature_eng.copy()

# get list of column headers without target variable
feature_cols = list(train_data_model.columns.values)
feature_cols.remove('Survived')

# one-hot encoded X inputs
encoded_X_train = pd.get_dummies(train_data_feature_eng[feature_cols])

X = encoded_X_train
y = train_data_model['Survived']

# instantiate with default parameters
logreg = LogisticRegression()
knn = KNeighborsClassifier()
mlp = MLPClassifier(random_state = 0)
svm = SVC()
nb = GaussianNB()
clf = tree.DecisionTreeClassifier()
ada = AdaBoostClassifier()
rf = RandomForestClassifier(max_depth = 5, random_state = 0)
gbm = GradientBoostingClassifier()
histgbm = HistGradientBoostingClassifier()
xgboost = XGBClassifier()
lightgbm = LGBMClassifier()

# insert new models and its name here
all_models_dict = {logreg: 'Logistics Regression', 
                   knn: 'K-Nearest Neighbors',
                   mlp: 'Multilayer Perceptron',
                   svm: 'Support Vector Machines',
                   nb: 'Naive Bayes',
                   clf: 'Decision Trees',
                   ada: 'ADABoost',
                   rf: 'Random Forest',
                   gbm: 'Gradient Boosting Classifier',
                   histgbm: 'Histogram Gradient Boosting Classifier',
                   xgboost: 'XGBoost',
                   lightgbm: 'LightGBM'}

# perform k-fold CV on model
def cross_val_evaluation(model):
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    n_scores = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1, error_score = 'raise')
    mean_n_scores = mean(n_scores)
    std_n_scores = std(n_scores)
    return mean_n_scores, std_n_scores

# dataframe to store performance of all model
model_performance = pd.DataFrame()

# lists to store all means, std, names
all_mean_scores = []
all_std_scores = []
all_model_names = []

# iteration to run all models, and store evaluation result / names into list
for model, model_name in all_models_dict.items():
    start_time = time.time()
    n_mean, n_std = cross_val_evaluation(model)
    model.fit(X, y)
    all_mean_scores.append(n_mean)
    all_std_scores.append(n_std)
    all_model_names.append(model_name)
    end_time = time.time()
    print(
    '''Time to execute {}: {} seconds'''.format(model_name, end_time - start_time))
    
# append result / names into dataframe
model_performance['Model Name'] = all_model_names
model_performance['Accuracy (Mean)'] = all_mean_scores
model_performance['Accuracy (STD)'] = all_std_scores

Time to execute Logistics Regression: 0.4806675910949707 seconds
Time to execute K-Nearest Neighbors: 0.2696065902709961 seconds




Time to execute Multilayer Perceptron: 11.175747394561768 seconds
Time to execute Support Vector Machines: 0.4478445053100586 seconds
Time to execute Naive Bayes: 0.1150064468383789 seconds
Time to execute Decision Trees: 0.10464954376220703 seconds
Time to execute ADABoost: 1.2727117538452148 seconds
Time to execute Random Forest: 2.1067144870758057 seconds
Time to execute Gradient Boosting Classifier: 1.424267292022705 seconds
Time to execute Histogram Gradient Boosting Classifier: 13.12043809890747 seconds




Time to execute XGBoost: 4.698972225189209 seconds
Time to execute LightGBM: 3.5442821979522705 seconds


## Choosing the best algorithm

In [11]:
'''
SELECT BEST PERFORMING MODEL
'''

model_performance.sort_values('Accuracy (Mean)', ascending = False)

Unnamed: 0,Model Name,Accuracy (Mean),Accuracy (STD)
3,Support Vector Machines,0.816675,0.038196
7,Random Forest,0.815543,0.038811
2,Multilayer Perceptron,0.815539,0.044572
8,Gradient Boosting Classifier,0.813296,0.038479
5,Decision Trees,0.809222,0.040122
9,Histogram Gradient Boosting Classifier,0.805834,0.037233
11,LightGBM,0.804719,0.037718
10,XGBoost,0.801361,0.040404
1,K-Nearest Neighbors,0.799842,0.044107
0,Logistics Regression,0.792734,0.038988


## Predicting Survived or not for validation data

In [12]:
'''
Predict Survived for unseen data
'''

val_data_feature_eng = val_data.copy()

# apply same feature engineering steps to validation data
val_data_feature_eng = feature_engineering(val_data_feature_eng)

# one-hot encoded validation X
encoded_X_val = pd.get_dummies(val_data_feature_eng)

# predicting Survived using trained model
y_val = svm.predict(encoded_X_val)
y_val = y_val.tolist()

print('Length of y_val: ', len(y_val))

Length of y_val:  418


## Creating CSV file for submission
### Competition details: https://www.kaggle.com/c/titanic/overview

In [None]:
'''
PREPARING PREDICTIONS FOR SUBMISSION
'''
val_data_submission = val_data.copy()

val_data_submission = val_data_submission[['PassengerId']]
val_data_submission['Survived'] = y_val
val_data_submission.to_csv('kaggle_titanic_svm_submission.csv', index = False)

## Appendix

In [None]:
'''
UNUSED CODE
'''

# MODEL EVALUATION USING TRAIN-TEST SPLIT
def evaluate_model(y_pred):
    print(
'''
Accuracy: {}
Precision: {}
Recall: {}
F1 Score: {}

'''.format(metrics.accuracy_score(y_test, y_pred),
           metrics.precision_score(y_test, y_pred),
           metrics.recall_score(y_test, y_pred),
           metrics.f1_score(y_test, y_pred)))

    
# TRAIN-TEST SPLIT
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# using the models to make predictions after model.fit and train-test split
logreg_y_pred = logreg.predict(X_test)
knn_y_pred = knn.predict(X_test)
mlp_y_pred = mlp.predict(X_test)
svm_y_pred = svm.predict(X_test)
nb_y_pred = nb.predict(X_test)
clf_y_pred = clf.predict(X_test)
ada_y_pred = ada.predict(X_test)
rf_y_pred = rf.predict(X_test)
gbm_y_pred = gbm.predict(X_test)

# model prediction accuracy on test data
print('LogReg')
evaluate_model(logreg_y_pred)
print('KNN')
evaluate_model(knn_y_pred)
print('MLP')
evaluate_model(mlp_y_pred)
print('SVM')
evaluate_model(svm_y_pred)
print('Naive Bayes')
evaluate_model(nb_y_pred)
print('Decision Trees')
evaluate_model(clf_y_pred)
print('ADA Boost')
evaluate_model(ada_y_pred)
print('Random Forest')
evaluate_model(rf_y_pred)
print('GBM')
evaluate_model(gbm_y_pred)
'''

# LONG METHOD TO VALIDATE MODEL USING KFOLD

'''
def cross_val_evaluation(model):
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    n_scores = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1, error_score = 'raise')
    print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fitting the models to the data
model_name = 'Logistic Regression'
print(model_name)
cross_val_evaluation(logreg)
logreg.fit(X_train, y_train)

print('KNN')
cross_val_evaluation(knn)
knn.fit(X_train, y_train)

print('MLP')
cross_val_evaluation(mlp)
mlp.fit(X_train, y_train)

print('SVM')
cross_val_evaluation(svm)
svm.fit(X_train, y_train)

print('Naive Bayes')
cross_val_evaluation(nb)
nb.fit(X_train, y_train)

print('Decision Trees')
cross_val_evaluation(clf)
clf.fit(X_train, y_train)

print('ADA Boost')
cross_val_evaluation(ada)
ada.fit(X_train, y_train)

print('Random Forest')
cross_val_evaluation(rf)
rf.fit(X_train, y_train)

print('GBM')
cross_val_evaluation(gbm)
gbm.fit(X_train, y_train)

print('HistGBM')
cross_val_evaluation(histgbm)
histgbm.fit(X_train, y_train)

print('XGBoost')
cross_val_evaluation(xgboost)
xgboost.fit(X_train, y_train)

print('LightGBM')
cross_val_evaluation(lightgbm)
lightgbm.fit(X_train, y_train)
'''