# Titanic

### Imports & Loading Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

In [2]:
# Load data
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

print(train_raw.info())
print(test_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [3]:
# Combine test and train for preprocessing
data = pd.concat([train_raw, test_raw], sort = False)

# Data Exploration

In [4]:
# Print few training examples and dtypes
display(data.head(10))
print(data.dtypes)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


**Explore the Training Set**

In [5]:
num_records = data.shape[0]                     # Total number of records
num_surv = data['Survived'].value_counts()[1]   # Number of survivors
num_death = data['Survived'].value_counts()[0]  # Number of deaths


# Print analysis
print('Total number of records: {}'.format(num_records))
print('Number of survivors (train): {}'.format(num_surv))
print('Number of deaths(train): {}'.format(num_death))
print('Percentage of deaths: {}%'.format(round(100 * num_death/(num_surv+num_death), 1)))

# Print Correlation between Features
print('\nCorrelation between numerical Features: ')
display(data.corr())

Total number of records: 1309
Number of survivors (train): 342
Number of deaths(train): 549
Percentage of deaths: 61.6%

Correlation between numerical Features: 


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.038354,0.028814,-0.055224,0.008942,0.031428
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.038354,-0.338481,1.0,-0.408106,0.060832,0.018322,-0.558629
Age,0.028814,-0.077221,-0.408106,1.0,-0.243699,-0.150917,0.17874
SibSp,-0.055224,-0.035322,0.060832,-0.243699,1.0,0.373587,0.160238
Parch,0.008942,0.081629,0.018322,-0.150917,0.373587,1.0,0.221539
Fare,0.031428,0.257307,-0.558629,0.17874,0.160238,0.221539,1.0


# Pre-Processing

## NaN Values

Here we explore the NaN values in our training and testing sets.

In [6]:
# Count total NaNs and non-NaNs
num_nan = data.isnull().sum().sum()
num_non_nan = data.count().sum()

# Print total NaN
print('Number of NaNs: {}'.format(num_nan))
print('Number of non-NaNs: {}\n'.format(num_non_nan))

# Print NaN for each feature
print('NaN by feature: \n{}\n'.format(data.isnull().sum()))

Number of NaNs: 1698
Number of non-NaNs: 14010

NaN by feature: 
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64



## Cleaning NaN
##### Key Points for NaN

- About 10% of our values are NaN (not including Survived in test set)
- NaN's reside almost exclusively in the "Age" and "Cabin" features for both the training and the testing set

**Age & Fare NaNs**

Because our dataset is quite small, we will not delete any inputs containing NaN values. Instead, we will try other techniques.

For the 'Age' and 'Fare' NaNs, we will replace with the median age in the data (hoping that the error won't be too large). 

In [7]:
# Deal with Age and Fare NaNs
data['Age'] = data['Age'].fillna(value=data['Age'].median(), axis = 0)
data['Fare'] = data['Fare'].fillna(value=data['Fare'].median(), axis = 0)

# Print NaN for each feature
print('NaN by feature: \n{}\n'.format(data.isnull().sum()))

NaN by feature: 
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          2
dtype: int64



**Embarked NaNs**

Since we only have 2 NaNs in Embarked and the most popular class is by far "S" (see below), we will replace the NaNs with S. Then, we will only have to deal with the Cabin NaNs

In [8]:
display(data['Category'].value_counts())

data['Category'] = data['Category'].fillna(value='S')
print('\nNaN by feature: \n{}\n'.format(data.isnull().sum()))

S    914
C    270
Q    123
Name: Embarked, dtype: int64


NaN by feature: 
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64



**Cabin NaNs**

Noticing that all Cabin values start with the classification letter, let's replace the individual cabins with the corresponding letter, assuming that the individual cabins don't have as much of an effect as the class they belong in. 

In [9]:
# Replace NaNs with Unknown
data['Cabin'] = data['Cabin'].fillna(value='Unknown')
# Keep only first letter
data['Cabin'] = data['Cabin'].apply(lambda s: s[0])
# Print 
display(data['Cabin'].value_counts())

print('\nNaN by feature: \n{}\n'.format(data.isnull().sum()))

U    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Cabin, dtype: int64


NaN by feature: 
PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64



## Feature Engineering

Since the size of our dataset is small, it would help to give our model more features to work with. 
Let's add in (Age * Pclass) as a feature as well as the family size.
Finally, let's include the fare per person.

In [10]:
# Age * Class
data['Age*Class'] = data['Age'] * data['Pclass']
# Family Size
data['Family_Size']= data['SibSp'] + data['Parch']
# Fare per Person
data['Fare_Per_Person']= data['Fare'] / (data['Family_Size']+1)

# Testing set
data['Age*Class'] = data['Age'] * data['Pclass']
data['Family_Size']= data['SibSp'] + data['Parch']
data['Fare_Per_Person']= data['Fare'] / (data['Family_Size']+1)

display(data)
display(data.corr())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age*Class,Family_Size,Fare_Per_Person
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,U,S,66.0,1,3.625000
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,38.0,1,35.641650
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,U,S,78.0,0,7.925000
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S,35.0,1,26.550000
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,U,S,105.0,0,8.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,28.0,0,0,A.5. 3236,8.0500,U,S,84.0,0,8.050000
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C,C,39.0,0,108.900000
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,U,S,115.5,0,7.250000
416,1308,,3,"Ware, Mr. Frederick",male,28.0,0,0,359309,8.0500,U,S,84.0,0,8.050000


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age*Class,Family_Size,Fare_Per_Person
PassengerId,1.0,-0.005007,-0.038354,0.025799,-0.055224,0.008942,0.031128,-0.021699,-0.031437,0.035217
Survived,-0.005007,1.0,-0.338481,-0.06491,-0.035322,0.081629,0.257307,-0.324559,0.016639,0.2216
Pclass,-0.038354,-0.338481,1.0,-0.377908,0.060832,0.018322,-0.558683,0.527534,0.050027,-0.504237
Age,0.025799,-0.06491,-0.377908,1.0,-0.189972,-0.125851,0.178182,0.522331,-0.193742,0.188546
SibSp,-0.055224,-0.035322,0.060832,-0.189972,1.0,0.373587,0.160349,-0.172068,0.861952,-0.089736
Parch,0.008942,0.081629,0.018322,-0.125851,0.373587,1.0,0.221635,-0.104951,0.792296,-0.065435
Fare,0.031128,0.257307,-0.558683,0.178182,0.160349,0.221635,1.0,-0.33556,0.226609,0.832025
Age*Class,-0.021699,-0.324559,0.527534,0.522331,-0.172068,-0.104951,-0.33556,1.0,-0.170542,-0.271237
Family_Size,-0.031437,0.016639,0.050027,-0.193742,0.861952,0.792296,0.226609,-0.170542,1.0,-0.094789
Fare_Per_Person,0.035217,0.2216,-0.504237,0.188546,-0.089736,-0.065435,0.832025,-0.271237,-0.094789,1.0


## Normalise Numerical Features

**Normalise**: Age, SibSp, Parch, Fare, Age*Class, Family_Size, Fare_Per_Person

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Age*Class', 'Family_Size', 'Fare_Per_Person']

data[numerical_features] = scaler.fit_transform(data[numerical_features])

display(data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age*Class,Family_Size,Fare_Per_Person
0,1,0.0,3,"Braund, Mr. Owen Harris",male,0.273456,0.125,0.000000,A/5 21171,0.014151,U,S,0.295679,0.1,0.007076
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.473882,0.125,0.000000,PC 17599,0.139136,C,C,0.169263,0.1,0.069568
2,3,1.0,3,"Heikkinen, Miss. Laina",female,0.323563,0.000,0.000000,STON/O2. 3101282,0.015469,U,S,0.349858,0.0,0.015469
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.436302,0.125,0.000000,113803,0.103644,C,S,0.155718,0.1,0.051822
4,5,0.0,3,"Allen, Mr. William Henry",male,0.436302,0.000,0.000000,373450,0.015713,U,S,0.471759,0.0,0.015713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,0.348616,0.000,0.000000,A.5. 3236,0.015713,U,S,0.376947,0.0,0.015713
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,0.486409,0.000,0.000000,PC 17758,0.212559,C,C,0.173778,0.0,0.212559
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,0.480145,0.000,0.000000,SOTON/O.Q. 3101262,0.014151,U,S,0.519166,0.0,0.014151
416,1308,,3,"Ware, Mr. Frederick",male,0.348616,0.000,0.000000,359309,0.015713,U,S,0.376947,0.0,0.015713


## One-Hot Encoding

**Assume** 0 correlation between Name/Ticket and survivability rate. Remove the Names and Ticket columns. 

**NB**: Useful info can be extracted from the Names column. This is because we can classify males, females, captains, etc. I will edit this in the future. 

In [12]:
data = data.drop(['Name', 'Ticket'], axis = 1)

**One-Hot Encode**: Pclass, Sex, Ticket, Cabin, Embarked

In [13]:
# Change Pclass to string to encode
data = data.astype({'Pclass': str})
print(data.dtypes)
# Get encoded
data = pd.get_dummies(data)

# Print the number of features after one-hot encoding
encoded = list(data.columns)
print("\n{} total features after one-hot encoding.\n".format(len(encoded)))
display(data)

PassengerId          int64
Survived           float64
Pclass              object
Sex                 object
Age                float64
SibSp              float64
Parch              float64
Fare               float64
Cabin               object
Embarked            object
Age*Class          float64
Family_Size        float64
Fare_Per_Person    float64
dtype: object

26 total features after one-hot encoding.



Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Age*Class,Family_Size,Fare_Per_Person,Pclass_1,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,0.273456,0.125,0.000000,0.014151,0.295679,0.1,0.007076,0,...,0,0,0,0,0,0,1,0,0,1
1,2,1.0,0.473882,0.125,0.000000,0.139136,0.169263,0.1,0.069568,1,...,1,0,0,0,0,0,0,1,0,0
2,3,1.0,0.323563,0.000,0.000000,0.015469,0.349858,0.0,0.015469,0,...,0,0,0,0,0,0,1,0,0,1
3,4,1.0,0.436302,0.125,0.000000,0.103644,0.155718,0.1,0.051822,1,...,1,0,0,0,0,0,0,0,0,1
4,5,0.0,0.436302,0.000,0.000000,0.015713,0.471759,0.0,0.015713,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,0.348616,0.000,0.000000,0.015713,0.376947,0.0,0.015713,0,...,0,0,0,0,0,0,1,0,0,1
414,1306,,0.486409,0.000,0.000000,0.212559,0.173778,0.0,0.212559,1,...,1,0,0,0,0,0,0,1,0,0
415,1307,,0.480145,0.000,0.000000,0.014151,0.519166,0.0,0.014151,0,...,0,0,0,0,0,0,1,0,0,1
416,1308,,0.348616,0.000,0.000000,0.015713,0.376947,0.0,0.015713,0,...,0,0,0,0,0,0,1,0,0,1


## Split

Split testing and training data, and split testing into features and labels

In [14]:
data = data.drop('PassengerId', axis=1)

In [45]:
# Split train and test
train_data = data[data['Survived'].notna()]
test_data = data[data['Survived'].isna()]
test_data = test_data.drop('Survived', axis=1)

# Split train into features and labels
train_features = train_data.drop('Survived', axis=1)
train_labels = train_data['Survived']

display(test_data)
display(train_features)
display(train_labels)

Unnamed: 0,Age,SibSp,Parch,Fare,Age*Class,Family_Size,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,0.430039,0.000,0.000000,0.015282,0.464987,0.0,0.015282,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,0.586622,0.125,0.000000,0.013663,0.634295,0.1,0.006832,0,0,1,...,0,0,0,0,0,0,1,0,0,1
2,0.774521,0.000,0.000000,0.018909,0.557542,0.0,0.018909,0,1,0,...,0,0,0,0,0,0,1,0,1,0
3,0.336089,0.000,0.000000,0.016908,0.363402,0.0,0.016908,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,0.273456,0.125,0.111111,0.023984,0.295679,0.2,0.007995,0,0,1,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.348616,0.000,0.000000,0.015713,0.376947,0.0,0.015713,0,0,1,...,0,0,0,0,0,0,1,0,0,1
414,0.486409,0.000,0.000000,0.212559,0.173778,0.0,0.212559,1,0,0,...,1,0,0,0,0,0,0,1,0,0
415,0.480145,0.000,0.000000,0.014151,0.519166,0.0,0.014151,0,0,1,...,0,0,0,0,0,0,1,0,0,1
416,0.348616,0.000,0.000000,0.015713,0.376947,0.0,0.015713,0,0,1,...,0,0,0,0,0,0,1,0,0,1


Unnamed: 0,Age,SibSp,Parch,Fare,Age*Class,Family_Size,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,0.273456,0.125,0.000000,0.014151,0.295679,0.1,0.007076,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,0.473882,0.125,0.000000,0.139136,0.169263,0.1,0.069568,1,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0.323563,0.000,0.000000,0.015469,0.349858,0.0,0.015469,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,0.436302,0.125,0.000000,0.103644,0.155718,0.1,0.051822,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0.436302,0.000,0.000000,0.015713,0.471759,0.0,0.015713,0,0,1,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.336089,0.000,0.000000,0.025374,0.241501,0.0,0.025374,0,1,0,...,0,0,0,0,0,0,1,0,0,1
887,0.235876,0.000,0.000000,0.058556,0.083480,0.0,0.058556,1,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0.348616,0.125,0.222222,0.045771,0.376947,0.3,0.011443,0,0,1,...,0,0,0,0,0,0,1,0,0,1
889,0.323563,0.000,0.000000,0.058556,0.115084,0.0,0.058556,1,0,0,...,1,0,0,0,0,0,0,1,0,0


0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

# Models

Now that the pre-processing is done, we are ready for the ML part!

Split data into training and validation sets

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_features, 
                                                    train_labels, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

## Naive Predictor

To have something to compare our model to, we'll calculate a *naive perdictor* accuracy.

Since more than 50% of people died, our naive predictor will assume that everyone died and we'll calculate its accuracy.

**NB:** Our metric of choice will be accuracy (instead of f-score) since there is not great imbalance in the classes (and since this is what Kaggle requests)

In [17]:
# Calculate and print "naive accuracy"
naive_accuracy = 100* num_death / (num_surv + num_death)

print('The naive prediction is: {}%'.format(round(naive_accuracy, 1)))

The naive prediction is: 61.6%


## Import and Instantiate Algorithm Classes

We will try out Multinomial and Gaussian Naive Bayes, SVMs, Decision Trees and Ensemble Methods (boosting, bagging, random forests)

In [18]:
# Ensemble Methods
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  # Decision Trees
from sklearn.naive_bayes import MultinomialNB    # Naive Bayes
from sklearn.naive_bayes import GaussianNB       # Gaussian Naive Bayes
from sklearn.svm import SVC                      # SVM
import random

# Instantiate classes
random.seed(42)
decision_tree = DecisionTreeClassifier(random_state=42)
naive_bayes = MultinomialNB()
naive_bayes_g = GaussianNB()
bagging = BaggingClassifier(random_state=42, n_jobs=-1)
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
svm = SVC(random_state=42)
adaboost = AdaBoostClassifier(random_state=42)

## Hypermarameter Tuning

Evaluate each models performance on a cross-validation set using grid search.

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from time import time




# Naive Bayes
alpha = [1]

# Bagging
n_estimators_bagging = [10, 20, 30, 50, 100, 200, 300]
max_samples_bagging = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 5.0]
max_features_bagging = [0.5, 1.0, 2.0, 3.0, 50.0, 200]

# Random Forests
criterion = ['gini', 'entropy']
n_estimators_rf = [100, 150, 200]
max_depth_rf = [None, 1, 3, 5, 10]
min_samples_leaf_rf = [5, 10]
min_samples_split_rf = [5, 10]


# SVC
kernel = ['rbf']
c_parameter = [0.0001, 0.001, 0.01, 0.1, 1, 10]
gamma = [0.0001, 0.001, 0.01, 0.1, 1]


# AdaBoost
n_estimators_ada = [10, 30, 50, 100, 200, 500]
learning_rate = [0.001, 0.01, 0.1, 0.5, 1, 1.5, 2]






# Hyperparameters
naive_bayes_parameters = {'alpha': alpha}
naive_bayes_g_parameters = {'var_smoothing': np.logspace(0,-9, num=1000)}
bagging_parameters = {'n_estimators': n_estimators_bagging, 'max_features': max_features_bagging, 'max_samples': max_samples_bagging}
random_forest_parameters = {'n_estimators': n_estimators_rf, 'max_depth': max_depth_rf, 'min_samples_leaf': min_samples_leaf_rf, 'min_samples_split': min_samples_split_rf}
svm_parameters = {'kernel': kernel, 'C': c_parameter, 'gamma': gamma}
adaboost_parameters = {'n_estimators': n_estimators_ada, 'learning_rate': learning_rate}

# Scoring object using accuracy
scorer = make_scorer(accuracy_score)


clfs_param =[(naive_bayes, naive_bayes_parameters), 
             (naive_bayes_g, naive_bayes_g_parameters), 
             (bagging, bagging_parameters), 
             (random_forest, random_forest_parameters), 
             (svm, svm_parameters), 
             (adaboost, adaboost_parameters)]

In [20]:
# Create list to store models
models = []
unopt_accuracies = []
accuracies = []

# Perform grid search
for clf, parameter in clfs_param:
    print('\n{}\n'.format(clf.__class__.__name__))
    
    grid_obj = GridSearchCV(clf, parameter, scoring=scorer, n_jobs = -1)
    
    # Perform grid search
    start = time()
    grid_fit = grid_obj.fit(X_train, y_train)
    end = time()
    print('Time to tune: {}s\n'.format(round(end - start), 2))
    
    # Get best estimator
    best_clf = grid_fit.best_estimator_
    models.append(best_clf)
    
    # Make predictions using the unoptimized and model
    predictions = (clf.fit(X_train, y_train)).predict(X_valid)
    best_predictions = best_clf.predict(X_valid)
    
    predictions_train = (clf.fit(X_train, y_train)).predict(X_train)
    best_predictions_train = best_clf.predict(X_train)
    
    # Check hyperparameters
    print('Unoptomised: {}\n'.format(clf.get_params(deep = True)))
    print('Optomised: {}\n'.format(best_clf.get_params(deep = True)))
    
    # Print Results
    print("\nUnoptimised-Accuracy-Training: {:.4f}".format(accuracy_score(y_train, predictions_train)))
    print("Optimised-Accuracy-training: {:.4f}".format(accuracy_score(y_train, best_predictions_train)))
    
    print("\nUnoptimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, predictions)))
    print("Optimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, best_predictions)))
    
    print('\n \n \n=============================================================================================')
    
    unopt_accuracies.append(accuracy_score(y_valid, predictions))
    accuracies.append(accuracy_score(y_valid, best_predictions))
    
print('All unoptimised accuracies (validation): {}'.format(unopt_accuracies))
print('Best unoptimised accuracy (validation): {}\n'.format(max(unopt_accuracies)))
print('All optimised accuracies (validation): {}'.format(accuracies))
print('Best optimised accuracy (validation): {}'.format(max(accuracies)))


MultinomialNB

Time to tune: 2s

Unoptomised: {'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

Optomised: {'alpha': 1, 'class_prior': None, 'fit_prior': True}


Unoptimised-Accuracy-Training: 0.7823
Optimised-Accuracy-training: 0.7823

Unoptimised-Accuracy-validation: 0.8156
Optimised-Accuracy-validation: 0.8156

 
 

GaussianNB

Time to tune: 5s

Unoptomised: {'priors': None, 'var_smoothing': 1e-09}

Optomised: {'priors': None, 'var_smoothing': 0.7175560918936926}


Unoptimised-Accuracy-Training: 0.7275
Optimised-Accuracy-training: 0.7907

Unoptimised-Accuracy-validation: 0.7151
Optimised-Accuracy-validation: 0.8380

 
 

BaggingClassifier

Time to tune: 12s

Unoptomised: {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

Optomised: {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'm

## Choose Model

I will use AdaBoost with our best model (GaussianNB) as a base estimator.

In [21]:
model_nb = models[2]
model_ada = models[-1]

ada_param_final = {'base_estimator': [None, model_nb], 'n_estimators': n_estimators_ada, 'learning_rate': learning_rate}

print('\n{}\n'.format(clf.__class__.__name__))

grid_obj = GridSearchCV(model_ada, ada_param_final, scoring=scorer, n_jobs = -1, verbose = 5)

# Perform grid search
start = time()
grid_fit = grid_obj.fit(X_train, y_train)
end = time()
print('Time to tune: {}s\n'.format(round(end - start), 2))

# Get best estimator
best_clf = grid_fit.best_estimator_
models.append(best_clf)

# Make predictions using the unoptimized and model
predictions = (model_ada.fit(X_train, y_train)).predict(X_valid)
best_predictions = best_clf.predict(X_valid)

predictions_train = (model_ada.fit(X_train, y_train)).predict(X_train)
best_predictions_train = best_clf.predict(X_train)

# Check hyperparameters
print('Unoptomised: {}\n'.format(model_ada.get_params(deep = True)))
print('Optomised: {}\n'.format(best_clf.get_params(deep = True)))

# Print Results
print("\nUnoptimised-Accuracy-Training: {:.4f}".format(accuracy_score(y_train, predictions_train)))
print("Optimised-Accuracy-training: {:.4f}".format(accuracy_score(y_train, best_predictions_train)))

print("\nUnoptimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, predictions)))
print("Optimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, best_predictions)))

print('\n \n \n=============================================================================================')

unopt_accuracies.append(accuracy_score(y_valid, predictions))
accuracies.append(accuracy_score(y_valid, best_predictions))



AdaBoostClassifier

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed: 23.2min finished


Time to tune: 1418s

Unoptomised: {'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}

Optomised: {'algorithm': 'SAMME.R', 'base_estimator__base_estimator': None, 'base_estimator__bootstrap': True, 'base_estimator__bootstrap_features': False, 'base_estimator__max_features': 0.5, 'base_estimator__max_samples': 0.2, 'base_estimator__n_estimators': 200, 'base_estimator__n_jobs': -1, 'base_estimator__oob_score': False, 'base_estimator__random_state': 42, 'base_estimator__verbose': 0, 'base_estimator__warm_start': False, 'base_estimator': BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=0.5, max_samples=0.2, n_estimators=200,
                  n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                  warm_start=False), 'learning_rate': 0.1, 'n_estimators': 200, 'random_state': 42}


Unoptimised-Accuracy-Training: 0.8118
Optimised-Accuracy-training: 0.8975

### Final Predictions

In [46]:
model = best_clf
pred = model.predict(test_data).astype(int)
pred.shape

(418,)

## Submission

In [50]:
PassengerId = test_raw['PassengerId']
submission = pd.DataFrame({'PassengerId': PassengerId, 'Survived': pred})

In [49]:
submission.to_csv("my_submission.csv", index = False)

# Final Submission Score (test set accuracy)

![Competition Results (accuracy on test set)](./titanic_results.png)