In [1]:
import pandas as pd

In [2]:
import os
# Show the kernel's working directory: the relative paths 'train.csv' and
# 'test.csv' read further down are resolved against it.
# NOTE(review): the saved output of this cell exposes a local user directory;
# consider clearing it before sharing the notebook.
os.getcwd()

'C:\\Users\\jing.xun.ng'

In [3]:
'''
IMPORT TRAIN DATA
'''
# training set, read from the notebook's working directory
data = pd.read_csv('train.csv')

# row count first, then a five-row preview (5 is head()'s default)
print('Length of train data: ', data.shape[0])
data.head()

Length of train data:  891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
'''
IMPORT VALIDATION DATA - For Kaggle Submission
'''
# rows that will be scored for the Kaggle submission, read from the working directory
val_data = pd.read_csv('test.csv')

print('Length of validation data: ', val_data.shape[0])

Length of validation data:  418


In [5]:
'''
DATA EXPLORATION
'''

# explore on a copy so the raw training frame stays untouched
data_exploration = data.copy()

# distribution of passengers who survived / did not survive
# 0 = did not survive, 1 = survived
survived_counts = data_exploration['Survived'].value_counts()
count_not_survived = survived_counts.loc[0]
count_survived = survived_counts.loc[1]

# distribution of passengers by gender
# Look the counts up by label: value_counts() is ordered by frequency, so a
# positional [0] / [1] only means male / female while males are the majority.
sex_counts = data_exploration['Sex'].value_counts()
count_male = sex_counts.loc['male']
count_female = sex_counts.loc['female']

# distribution of passengers by survived AND gender
# the size() Series is indexed by (Survived, Sex); .get(..., 0) reports an
# absent combination as a zero count instead of raising
survived_sex_sizes = data_exploration.groupby(['Survived', 'Sex']).size()
df_survived_gender = survived_sex_sizes.reset_index(name = 'Count')
count_male_survived = survived_sex_sizes.get((1, 'male'), 0)
count_male_not_survived = survived_sex_sizes.get((0, 'male'), 0)
count_female_survived = survived_sex_sizes.get((1, 'female'), 0)
count_female_not_survived = survived_sex_sizes.get((0, 'female'), 0)

# distribution of passengers by survived AND age (10 equal-width age bins)
df_age_survived = pd.crosstab(pd.cut(data_exploration['Age'], bins = 10), data_exploration['Survived'])

# distribution of passengers by survived AND fare (10 equal-width fare bins)
df_fare_survived = pd.crosstab(pd.cut(data_exploration['Fare'], bins = 10), data_exploration['Survived'])

# distribution of passengers by survived AND Pclass (socio-economic class)
df_pclass_survived = data_exploration.groupby(['Pclass', 'Survived']).size().reset_index(name = 'Count')

# distribution of passengers by survived AND Parch (# of parents/children onboard)
df_parch_survived = data_exploration.groupby(['Parch', 'Survived']).size().reset_index(name = 'Count')

# distribution of passengers by survived AND SibSp (# of siblings spouse on board)
df_sibsp_survived = data_exploration.groupby(['SibSp', 'Survived']).size().reset_index(name = 'Count')

# the placeholders below are filled positionally, in the order of the
# .format(...) arguments at the bottom
print(
'''

STATISTICS:

Number of rows in train data: {}

Number of NA in all columns:
{}


(1) BY GENDER

<1>
Number of passengers who survived: {}
Number of passengers who did not survive: {}

<2>
Number of male passengers: {}
Number of female passengers: {}

<3>
Number of male passengers who survived: {}
Number of male passengers who did not survive: {}
Number of female passengers who survived: {}
Number of female passengers who did not survive: {}


(2) BY AGE

{}


(3) BY PASSENGER FARE

{}


(4) BY PCLASS

{}


(5) BY # OF PARENTS/CHILDREN

{}


(6) BY # OF SIBLINGS / SPOUSES

{}


'''.format(len(data_exploration), data_exploration.isna().sum(),
           count_survived, count_not_survived, 
           count_male, count_female, 
           count_male_survived, count_male_not_survived,
           count_female_survived, count_female_not_survived, 
           df_age_survived,
           df_fare_survived,
           df_pclass_survived,
           df_parch_survived,
           df_sibsp_survived))



STATISTICS:

Number of rows in train data: 891

Number of NA in all columns:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


(1) BY GENDER

<1>
Number of passengers who survived: 342
Number of passengers who did not survive: 549

<2>
Number of male passengers: 577
Number of female passengers: 314

<3>
Number of male passengers who survived: 109
Number of male passengers who did not survive: 468
Number of female passengers who survived: 233
Number of female passengers who did not survived: 81


(2) BY AGE

Survived            0   1
Age                      
(0.34, 8.378]      18  36
(8.378, 16.336]    27  19
(16.336, 24.294]  114  63
(24.294, 32.252]  104  65
(32.252, 40.21]    66  52
(40.21, 48.168]    46  24
(48.168, 56.126]   24  21
(56.126, 64.084]   15   9
(64.084, 72.042]    9   0
(72.04

In [8]:
'''
FEATURE ENGINEERING
'''

def feature_engineering(df):
    """Recode a raw passenger frame into the numeric features used for modelling.

    The caller's frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PassengerId', 'Name', 'Cabin', 'Ticket',
        'Embarked', 'Age', 'SibSp', 'Parch', 'Fare' and 'Sex'. Any other
        column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the five dropped columns and with:

        * 'Age'   -> 1 (0-14), 2 (15-24), 3 (25-64), 4 (65-120), 0 (missing / out of range)
        * 'SibSp' -> 1 if at least one sibling/spouse is aboard, else 0
        * 'Parch' -> 1 if at least one parent/child is aboard, else 0
        * 'Fare'  -> 1 (>= 30), 2 (>= 13 and < 30), 3 (< 13 or missing)
        * 'Sex'   -> 1 (male), 0 (female); any other value becomes NaN

    Raises
    ------
    KeyError
        If one of the columns to drop or recode is absent.
    """
    # drop passengerid, Name (identifiers), cabin (mostly nans), ticket, embarked (low relevance to survived) columns
    # drop() returns a new frame, so every assignment below leaves the caller's frame alone
    df = df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'Embarked'], axis = 1)

    # convert age to categorical type using classification
    # ref: https://www.statcan.gc.ca/en/concepts/definitions/age2
    # children (1), youth (2), adults (3), seniors (4), nans (0)
    # include_lowest closes the first bin on the left, so an age of exactly 0
    # is coded as a child instead of falling out of range and being read as "no data"
    age_group = pd.cut(df['Age'], bins = [0, 14, 24, 64, 120], labels = [1, 2, 3, 4], include_lowest = True)
    df['Age'] = age_group.astype(float).fillna(0)

    # convert sibsp, parch to binary (whether siblings/spouse, parent/children is onboard (1) OR not (0))
    df['SibSp'] = (df['SibSp'] >= 1).astype(int)
    df['Parch'] = (df['Parch'] >= 1).astype(int)

    # convert passenger fare to first (>= $30), second (>= $13, < $30), third (< $13) class
    # ref: https://www.bbc.co.uk/bitesize/topics/z8mpfg8/articles/zng8jty
    # Bands are derived from the untouched fare values and written once, so the
    # result does not depend on the band codes staying below the thresholds.
    fare = df['Fare'].fillna(0)  # a missing fare is treated as 0, i.e. third class
    fare_class = pd.Series(3.0, index = df.index)
    fare_class[fare >= 13] = 2.0
    fare_class[fare >= 30] = 1.0
    df['Fare'] = fare_class

    # convert sex column to binary (male (1), female (0))
    # map() builds a numeric column rather than writing ints into a string column
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

    # NOTE: Fare is kept as a feature. The original author flagged possible
    # multicollinearity between Fare and Pclass but left dropping it disabled.
    return df

# run the feature-engineering step on a private copy of the training frame
train_data_feature_eng = feature_engineering(data.copy())

# row count, then a preview of the first 20 engineered rows
print('Number of rows: ', train_data_feature_eng.shape[0])
train_data_feature_eng.head(20)

Number of rows:  891


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,2.0,1,0,3.0
1,1,1,0,3.0,1,0,1.0
2,1,3,0,3.0,0,0,3.0
3,1,1,0,3.0,1,0,1.0
4,0,3,1,3.0,0,0,3.0
5,0,3,1,0.0,0,0,3.0
6,0,1,1,3.0,0,0,1.0
7,0,3,1,1.0,1,1,2.0
8,1,3,0,3.0,0,1,3.0
9,1,2,0,1.0,1,0,1.0


In [17]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

# ensemble method
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# work on a copy of the engineered training frame
train_data_model = train_data_feature_eng.copy()

feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

X = train_data_model[feature_cols]
y = train_data_model['Survived']

# train-test split (25% held out; the fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# instantiate with default parameters
# Every estimator that draws random numbers gets the same fixed seed as the
# split, so re-running the cell reproduces the printed metrics.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
mlp = MLPClassifier(random_state = 0)
svm = SVC()
nb = GaussianNB()
clf = tree.DecisionTreeClassifier(random_state = 0)
ada = AdaBoostClassifier(random_state = 0)
rf = RandomForestClassifier(random_state = 0)

# report label -> estimator, in the order the results are printed
models = {
    'LogReg': logreg,
    'KNN': knn,
    'MLP': mlp,
    'SVM': svm,
    'Naive Bayes': nb,
    'Decision Trees': clf,
    'ADA Boost': ada,
    'Random Forest': rf,
}

# fitting the models to the data
for model in models.values():
    model.fit(X_train, y_train)

# using the models to make predictions
predictions = {label: model.predict(X_test) for label, model in models.items()}

# model prediction accuracy on test data
for label, y_pred in predictions.items():
    print(label)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))

# keep the per-model prediction names available for any later cell that uses them
logreg_y_pred = predictions['LogReg']
knn_y_pred = predictions['KNN']
mlp_y_pred = predictions['MLP']
svm_y_pred = predictions['SVM']
nb_y_pred = predictions['Naive Bayes']
clf_y_pred = predictions['Decision Trees']
ada_y_pred = predictions['ADA Boost']
rf_y_pred = predictions['Random Forest']



LogReg
Accuracy:  0.7757847533632287
Precision:  0.7023809523809523
Recall:  0.7023809523809523
KNN
Accuracy:  0.7757847533632287
Precision:  0.75
Recall:  0.6071428571428571
MLP
Accuracy:  0.7847533632286996
Precision:  0.7307692307692307
Recall:  0.6785714285714286
SVM
Accuracy:  0.7892376681614349
Precision:  0.7402597402597403
Recall:  0.6785714285714286
Naive Bayes
Accuracy:  0.7713004484304933
Precision:  0.6736842105263158
Recall:  0.7619047619047619
Decision Trees
Accuracy:  0.7713004484304933
Precision:  0.7037037037037037
Recall:  0.6785714285714286
ADA Boost
Accuracy:  0.7802690582959642
Precision:  0.7108433734939759
Recall:  0.7023809523809523
Random Forest
Accuracy:  0.7713004484304933
Precision:  0.7037037037037037
Recall:  0.6785714285714286




In [16]:
'''
DATA VALIDATION USING UNSEEN DATA 
'''

val_data_feature_eng = val_data.copy()

# apply same feature engineering steps to validation data
val_data_feature_eng = feature_engineering(val_data_feature_eng)

# predicting Survived using trained model
# Select the training feature columns explicitly so the model receives exactly
# the columns it was fitted on, in the same order, whatever else the frame holds.
y_val = knn.predict(val_data_feature_eng[feature_cols])
y_val = y_val.tolist()

print('Length of y_val: ', len(y_val))

Length of y_val:  418




In [92]:
'''
PREPARING PREDICTIONS FOR SUBMISSION
'''
# Copy the id column into an independent frame before adding 'Survived';
# assigning to a bare column-subset selection triggers SettingWithCopyWarning.
val_data_submission = val_data[['PassengerId']].copy()
val_data_submission['Survived'] = y_val

# two-column file (PassengerId, Survived) without the index column
val_data_submission.to_csv('kaggle_titanic_knn_submission.csv', index = False)