# Titanic v3

#### Import Libraries

In [1]:
# Standard Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import random as random

# Function to fill missing values
from sklearn.impute import SimpleImputer
from random import randint

# EDA/ Data Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import make_column_transformer

# Model Splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Scoring Metrics
from sklearn.metrics import *

#### Settings

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None) 

#### Read Data

In [3]:
# Read CSV file
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

#### Initial Analysis of Dataset

In [4]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 12)
Shape of Test set (raw) : (418, 11)


In [5]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

#### Things to Note:
- `train` set missing values: `Cabin` > `Age` > `Embarked`.
- `test` set missing values: `Cabin` > `Age` > `Fare`.

- Total: **1309 observations**
- `train` have 891 observations and 12 variables (including `PassengerId`).
- `test` have 418 observations and 11 variables (including `PassengerId` and excluding `Survived`.

# Plan

- `PassengerId` : drop from train, keep for test.

- `Survived` : Approximately 38.3% of people survived.

- `Pclass` : Do not touch it.

- `Sex` : 2 different sexes only, use one-hot-encoding.

- `SibSp` + `Parch` : combine to `familysize`, convert to `isAlone`.

- `Ticket` + `Cabin` : Uninformative and too many missing values respectively, drop.

- `Embarked` : 2 missing values, impute with mode.

- `Fare` : Impute with Median. Consider applying log and group to `fare_range`

- `Name` : Separate out the titles, generalize for uncommon titles.

- `Age` : Impute with values from Pclass and Sex. 

- `Age*Pclass` : A lot of examples have it, try for this.

## PassengerId

In [7]:
# Drop for train, keep for test.
train.drop('PassengerId', axis = 'columns', inplace = True)

test_id = test[['PassengerId']]
test.drop('PassengerId', axis = 'columns', inplace = True)

In [8]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


In [9]:
test_id.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


## Survived

In [10]:
# Do nothing.

## Pclass

In [11]:
# Do nothing. Exploration of Pclass-Sex Survived, Pclass-Age, Pclass-embarked, Pclass-Fare done later if needed.

## Sex

In [12]:
# Use OHE.
gender_dict = {'male' : 0, 'female' : 1}

train['Sex'] = train['Sex'].map(gender_dict)
test['Sex'] = test['Sex'].map(gender_dict)

## SibSp and Parch

In [13]:
# Create Variable familysize
train['familysize'] = train['SibSp'] + train['Parch']
test['familysize'] = test['SibSp'] + test['Parch']

# Create isAlone -> 0: not alone, 1: is alone
train.loc[train['familysize'] > 0, 'isAlone'] = 0
train.loc[train['familysize'] == 0, 'isAlone'] = 0

test.loc[test['familysize'] > 0, 'isAlone'] = 0
test.loc[test['familysize'] == 0, 'isAlone'] = 0

# Convert isAlone to int
train['isAlone'] = train['isAlone'].astype(int)
test['isAlone'] = test['isAlone'].astype(int)

In [14]:
# # Drop 'SibSp', 'Parch', 'familysize'
# train.drop(columns = ['familysize', 'SibSp', 'Parch'], axis = 'columns', inplace = True)
# test.drop(columns = ['familysize', 'SibSp', 'Parch'], axis = 'columns', inplace = True)

In [15]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 13)
Shape of Test set (raw) : (418, 12)


## Ticket and Cabin

In [16]:
# Drop both
train.drop(columns = ['Ticket', 'Cabin'], axis = 'columns', inplace = True)
test.drop(columns = ['Ticket', 'Cabin'], axis = 'columns', inplace = True)

In [17]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


## Embarked

In [18]:
# Impute with mode for train only
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace = True)

In [19]:
# Convert to ordinal for train and test
port_dict = {'Q': 1, 'C': 2, 'S': 3}

train['Embarked'] = train['Embarked'].map(port_dict)
test['Embarked'] = test['Embarked'].map(port_dict)

In [20]:
# Convert to int
train['Embarked'] = train['Embarked'].astype(int)
test['Embarked'] = test['Embarked'].astype(int)

In [21]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


## Fare

In [22]:
# Impute fare with median first
test['Fare'].fillna(test['Fare'].median(), inplace = True)

In [23]:
# Apply log transformation
train['log_fare'] = np.log1p(train['Fare'])
test['log_fare'] = np.log1p(test['Fare'])

# Drop Fare
train.drop('Fare', axis = 'columns', inplace = True)
test.drop('Fare', axis = 'columns', inplace = True)

In [24]:
# Change the range
train['log_fare_range'] = pd.cut(train['log_fare'], 4)
train[['log_fare_range', 'Survived']].groupby(['log_fare_range'], 
                                              as_index=False).mean().sort_values(by='log_fare_range', 
                                                                                 ascending=True)

# Change range for test
test['log_fare_range'] = pd.cut(test['log_fare'], 4)

In [25]:
# convert log_fare to numerical
# Convert log_fare to log_fare range for train
train.loc[(train['log_fare'] <= 1.56), 'log_fare'] = 0
train.loc[(train['log_fare'] <= 3.12)  & (train['log_fare'] > 1.56), 'log_fare'] = 1
train.loc[(train['log_fare'] <= 4.681)  & (train['log_fare'] > 3.12), 'log_fare'] = 2
train.loc[(train['log_fare'] > 4.681), 'log_fare'] = 3

train['log_fare'] = train['log_fare'].astype(int)

In [26]:
# convert log_fare to numerical for test
test.loc[(test['log_fare'] <= 1.56), 'log_fare'] = 0
test.loc[(test['log_fare'] <= 3.12)  & (test['log_fare'] > 1.56), 'log_fare'] = 1
test.loc[(test['log_fare'] <= 4.681)  & (test['log_fare'] > 3.12), 'log_fare'] = 2
test.loc[(test['log_fare'] > 4.681), 'log_fare'] = 3

test['log_fare'] = test['log_fare'].astype(int)

In [27]:
# drop log_fare_range in train and test
train.drop('log_fare_range', axis = 'columns', inplace = True)
test.drop('log_fare_range', axis = 'columns', inplace = True)

In [28]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


## Name

In [29]:
# Extract titles from training data and test data
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Uncomment to check the unique titles given
# print(train['Title'].value_counts())
# print(test['Title'].value_counts())

In [30]:
# Set the dictionary
TitleDict = {"Capt": "Officer", "Col": "Officer", "Major": "Officer", "Jonkheer": "Royalty", \
             "Don": "Royalty", "Sir" : "Royalty","Dr": "Royalty","Rev": "Royalty", \
             "Countess":"Royalty", "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs","Mr" : "Mr", \
             "Mrs" : "Mrs","Miss" : "Miss","Master" : "Master","Lady" : "Royalty", "Dona": "Royalty"}

# Map the dict
train['Title'], test['Title'] = [df.Title.map(TitleDict) for df in [train, test]]

In [31]:
# # Replacing titles
# train['Title'] = train['Title'].replace(['Countess', 'Capt', 'Rev', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 
#                                          'Major', 'Dona'], 'others')
# train['Title'] = train['Title'].replace('Ms', 'Miss')
# train['Title'] = train['Title'].replace('Mlle', 'Miss')
# train['Title'] = train['Title'].replace('Mme', 'Mrs')
# train['Title'] = train['Title'].replace('Sir', 'Mr')


# test['Title'] = test['Title'].replace(['Countess', 'Capt', 'Rev', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 
#                                        'Major', 'Dona'], 'others')
# test['Title'] = test['Title'].replace('Ms', 'Miss')
# test['Title'] = test['Title'].replace('Mlle', 'Miss')
# test['Title'] = test['Title'].replace('Mme', 'Mrs')
# test['Title'] = test['Title'].replace('Sir', 'Mr')

# Check the unique titles given
print(train['Title'].value_counts())
print(test['Title'].value_counts())

Mr         517
Miss       184
Mrs        127
Master      40
Royalty     18
Officer      5
Name: Title, dtype: int64
Mr         240
Miss        78
Mrs         73
Master      21
Royalty      4
Officer      2
Name: Title, dtype: int64


#### Information about titles
- `Don` : noble men
- `Lady`, `Countess` : noble women
- `Jonkheer` : young man or woman of nobility

In [32]:
# # Create mapping
# title_dict = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master' : 4, 'others': 5}

# # Map
# train['Title'] = train['Title'].map(title_dict).astype(int)
# test['Title'] = test['Title'].map(title_dict).astype(int)

In [33]:
# Drop name in test and train
train.drop('Name', axis = 'columns', inplace = True)
test.drop('Name', axis = 'columns', inplace = True)

In [34]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


## Age

In [35]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,familysize,isAlone,log_fare,Title
0,3,0,34.5,0,0,1,0,0,1,Mr
1,3,1,47.0,1,0,3,1,0,1,Mrs
2,2,0,62.0,0,0,1,0,0,1,Mr
3,3,0,27.0,0,0,3,0,0,1,Mr
4,3,1,22.0,1,1,3,2,0,1,Mrs


In [36]:
# See the average age
train.groupby(['Title', 'Pclass'])['Age'].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Title,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Master,1,5.306667,3
Master,2,2.258889,9
Master,3,5.350833,24
Miss,1,29.744681,47
Miss,2,22.390625,32
Miss,3,16.123188,69
Mr,1,41.58046,87
Mr,2,32.768293,82
Mr,3,28.724891,229
Mrs,1,40.4,35


In [37]:
# categorize a new title = 'FemaleChild' because the variation of Miss varies too greatly, so depends on parch
train.loc[(train.Title=='Miss') & (train.Parch!=0) & (train.familysize>1), 'Title']="FemaleChild"
test.loc[(test.Title=='Miss') & (test.Parch!=0) & (test.familysize>1), 'Title']="FemaleChild"

In [38]:
# Create lookup table
grp = train.groupby(['Pclass','Sex','Title'])['Age'].mean().reset_index()[['Sex', 'Pclass', 'Title', 'Age']]

In [39]:
grp.head()

Unnamed: 0,Sex,Pclass,Title,Age
0,0,1,Master,5.306667
1,0,1,Mr,41.58046
2,0,1,Officer,56.6
3,0,1,Royalty,42.166667
4,1,1,FemaleChild,22.25


In [40]:
# x -> row that has the missing age value
def fill_age(x):
    return grp[(grp.Pclass==x.Pclass)&(grp.Sex==x.Sex)&(grp.Title==x.Title)]['Age'].values[0]

In [41]:
# Apply for train and test sets
train['Age'], test['Age'] = [df.apply(lambda x: fill_age(x) if np.isnan(x['Age']) else x['Age'], axis=1) \
                             for df in [train, test]]

#### Note

**Imputed values of age are based on the means of the training data ONLY. This is done to prevent data leakage into the test set.**

In [42]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


In [43]:
# Group Age into ranges
train['AgeRange'] = pd.cut(train['Age'], 5)
train[['AgeRange', 'Survived']].groupby(['AgeRange'], as_index=False).mean().sort_values(by='AgeRange', 
                                                                                         ascending=True)

Unnamed: 0,AgeRange,Survived
0,"(0.34, 16.336]",0.513274
1,"(16.336, 32.252]",0.335498
2,"(32.252, 48.168]",0.415254
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [44]:
# Convert Age to age range for train
train.loc[(train['Age'] <= 16), 'Age'] = 0
train.loc[(train['Age'] <= 32)  & (train['Age'] > 16), 'Age'] = 1
train.loc[(train['Age'] <= 48)  & (train['Age'] > 32), 'Age'] = 2
train.loc[(train['Age'] <= 64)  & (train['Age'] > 48), 'Age'] = 3
train.loc[(train['Age'] > 64), 'Age'] = 4

train['Age'] = train['Age'].astype(int)

In [45]:
# Convert Age to age range for test
test.loc[(test['Age'] <= 16), 'Age'] = 0
test.loc[(test['Age'] <= 32)  & (test['Age'] > 16), 'Age'] = 1
test.loc[(test['Age'] <= 48)  & (test['Age'] > 32), 'Age'] = 2
test.loc[(test['Age'] <= 64)  & (test['Age'] > 48), 'Age'] = 3
test.loc[(test['Age'] > 64), 'Age'] = 4

test['Age'] = test['Age'].astype(int)

In [46]:
# Drop 'AgeRange'
train.drop('AgeRange', axis = 'columns', inplace = True)

In [47]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 11)
Shape of Test set (raw) : (418, 10)


# Further Feature Engineering

# Split Data

In [48]:
train.Title.value_counts()

Mr             517
Miss           131
Mrs            127
FemaleChild     53
Master          40
Royalty         18
Officer          5
Name: Title, dtype: int64

In [49]:
# Convert title to numerical
title_dict = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master' : 4, 'Royalty': 5, 'FemaleChild': 6, 'Officer':7}

# Map
train['Title'] = train['Title'].map(title_dict).astype(int)
test['Title'] = test['Title'].map(title_dict).astype(int)

In [50]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,familysize,isAlone,log_fare,Title
0,0,3,0,1,1,0,3,1,0,1,1
1,1,1,1,2,1,0,2,1,0,2,3
2,1,3,1,1,0,0,3,0,0,1,2
3,1,1,1,2,1,0,3,1,0,2,3
4,0,3,0,2,0,0,3,0,0,1,1


In [51]:
# Drop all relevant columns
train.drop(columns = ['SibSp', 'Parch', 'familysize'], axis = 'columns', inplace = True)
test.drop(columns = ['SibSp', 'Parch', 'familysize'], axis = 'columns', inplace = True)

In [52]:
# Shape of Data
print(f"Shape of Train set (raw) : {train.shape}")
print(f"Shape of Test set (raw) : {test.shape}")

Shape of Train set (raw) : (891, 8)
Shape of Test set (raw) : (418, 7)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:, 1:], train.iloc[:, 0], test_size = 0.2,
                                                    random_state = 26)

In [54]:
# Print scores
def print_score(test, pred):
    print(f"Accuracy : {round(accuracy_score(test, pred), 3)}%")
    print(f"Recall : {round(recall_score(test, pred), 3)}%")
    print(f"Precision : {round(precision_score(test, pred), 3)}%")
    print(f"f1 Score : {round(f1_score(test, pred), 3)}%")

# Models

In [55]:
# Models
dt = DecisionTreeClassifier(random_state = 26)
rf = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors = 10)
xgb = XGBClassifier(booster = 'gbtree')
svc = SVC()

## Decision Tree Classifier

In [56]:
# fitting the model
dt.fit(X_train, y_train)

# predictions
y_pred = dt.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.799%
Recall : 0.635%
Precision : 0.755%
f1 Score : 0.69%


In [57]:
# Hyperparameter tuning for DT
dt_grid = {'max_depth': [3,4,5,6,7,8], 'min_samples_leaf': [2,4,6,8,10], 'min_samples_split':
       [2,3,4,5,6,7,8,9,10]}

dt_cv = GridSearchCV(DecisionTreeClassifier(), dt_grid, cv=10, n_jobs=-1, verbose=1)

In [58]:
dt_cv.fit(X_train, y_train)

Fitting 10 folds for each of 270 candidates, totalling 2700 fits


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8],
                         'min_samples_leaf': [2, 4, 6, 8, 10],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
             verbose=1)

In [59]:
best_dt = dt_cv.best_estimator_

In [60]:
# fit the model
best_dt.fit(X_train, y_train)

# predictions
y_pred = best_dt.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.81%
Recall : 0.683%
Precision : 0.754%
f1 Score : 0.717%


## Random Forest Classifier

In [61]:
# fit the model
rf.fit(X_train, y_train)

# predictions
y_pred = rf.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.81%
Recall : 0.667%
Precision : 0.764%
f1 Score : 0.712%


In [62]:
# Hyperparameter tuning for rf
rf_grid = {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [3,4,5,6,7,8], 'min_samples_leaf': [2,3,4,5,6,7,8,9,10],
           'min_samples_split': [2,3,4,5,6,7,8]}

rf_cv = GridSearchCV(RandomForestClassifier(), rf_grid, cv=10, n_jobs=-1, verbose=1)

In [63]:
# fit the model
rf_cv.fit(X_train, y_train)

Fitting 10 folds for each of 1134 candidates, totalling 11340 fits




GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [3, 4, 5, 6, 7, 8],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8]},
             verbose=1)

In [64]:
best_rf = rf_cv.best_estimator_

In [65]:
# fit data
best_rf.fit(X_train, y_train)

# predict data
y_pred = best_rf.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.821%
Recall : 0.667%
Precision : 0.792%
f1 Score : 0.724%


In [66]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

## K-Nearest Neighbors

In [67]:
# fit model
knn.fit(X_train, y_train)

# predictions
y_pred = knn.predict(X_test)

# model score
print_score(y_test, y_pred)

Accuracy : 0.799%
Recall : 0.667%
Precision : 0.737%
f1 Score : 0.7%


In [68]:
# Hyperparameter tuning in knn
knn_grid = {'n_neighbors' : list(range(3,100)), 'p' : [1,2]}

knn_cv = GridSearchCV(KNeighborsClassifier(), knn_grid, cv=10, n_jobs=-1, verbose=1)

In [69]:
knn_cv.fit(X_train, y_train)

Fitting 10 folds for each of 194 candidates, totalling 1940 fits


GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                         24, 25, 26, 27, 28, 29, 30, 31, 32, ...],
                         'p': [1, 2]},
             verbose=1)

In [70]:
best_knn = knn_cv.best_estimator_

In [71]:
# fit model
best_knn.fit(X_train, y_train)

# predictions
y_pred = best_knn.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.799%
Recall : 0.667%
Precision : 0.737%
f1 Score : 0.7%


## XGBoost

In [72]:
# fit data
xgb.fit(X_train, y_train)

# predictions
y_pred = xgb.predict(X_test)

# print scores
print_score(y_test, y_pred)

Accuracy : 0.804%
Recall : 0.667%
Precision : 0.75%
f1 Score : 0.706%


In [73]:
# Hyperparameter tuning in xgb
xgb_grid = {'eta': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8], "learning_rate": [0.01, 0.05, 0.1], "n_estimators": 
            [100, 500, 1000]}

xgb_cv = GridSearchCV(XGBClassifier(booster = 'gbtree'), xgb_grid, cv=10, n_jobs=-1, verbose=1)

In [74]:
xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster='gbtree',
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=

In [75]:
best_xgb = xgb_cv.best_estimator_

In [76]:
# fit model
best_xgb.fit(X_train, y_train)

# predictions
y_pred = best_xgb.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.799%
Recall : 0.667%
Precision : 0.737%
f1 Score : 0.7%


## SVM

In [77]:
# fit data
svc.fit(X_train, y_train)

# predictions
y_pred = svc.predict(X_test)

# print scores
print_score(y_test, y_pred)

Accuracy : 0.816%
Recall : 0.714%
Precision : 0.75%
f1 Score : 0.732%


In [78]:
# Hyperparameter tuning in svc
svc_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 'degree':[2,3,4,5,6,7], 'gamma': np.linspace(0,1,10).tolist()}

svc_cv = GridSearchCV(SVC(), svc_grid, cv=10, n_jobs=-1, verbose=1)

In [79]:
svc_cv.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'degree': [2, 3, 4, 5, 6, 7],
                         'gamma': [0.0, 0.1111111111111111, 0.2222222222222222,
                                   0.3333333333333333, 0.4444444444444444,
                                   0.5555555555555556, 0.6666666666666666,
                                   0.7777777777777777, 0.8888888888888888,
                                   1.0],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             verbose=1)

In [80]:
best_svc = svc_cv.best_estimator_

In [81]:
# fit model
best_svc.fit(X_train, y_train)

# predictions
y_pred = best_svc.predict(X_test)

# print score
print_score(y_test, y_pred)

Accuracy : 0.827%
Recall : 0.667%
Precision : 0.808%
f1 Score : 0.73%


# Evaluation

Purely on accuracy score, best_svc and best_rf have the highest accuracy of 0.816%

**best_svc : 0.827**

**best_rf : 0.821**

**best_dt : 0.810**

**xgb : 0.804**

**best_knn : 0.799**

## Best Model

In [86]:
# use best_rf to predict on test set
y_predictions = best_rf.predict(test)

In [87]:
y_predictions = pd.DataFrame(y_predictions)

In [88]:
output = pd.DataFrame({'PassengerId': test_id.iloc[:, 0], 'Survived': y_predictions.iloc[:, 0]})

# Submission

In [89]:
output.to_csv('submission.csv', index=False)

print("Your submission was successfully saved!")

Your submission was successfully saved!
