# Titanic: Machine Learning from Disaster

Load in some libraries.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pprint

Load in the training and test datasets.

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

Let's have a look at the structure in the data.

In [3]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [5]:
# Let's see how many rows and columns we have got.
train.shape

(891, 12)

Let's see whether we have any NaN values in our training and test datasets.

In [6]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

When we look at the data, we see that each passenger has a title like Mr., Mrs., etc. before his name. We can use these titles to gain additional information.

In [8]:
train_titles = train['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [9]:
train_titles.isna().sum()

0

In [10]:
print(train_titles.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']


It's good that every passenger has a title.

In [11]:
train['Title'] = train_titles

Let's say we want to fill the missing Age values with the mean Age of passengers (or any other value that 
already exists in the 'Age' column). In this case, we will put many passengers into more or less the same age group. 
But what if most passengers with roughly the mean age actually had died or survived the disaster so that our 
classification algorithm could have made a separation between those and the passengers with other age values.
So, we might lose some of the information in the data. Thus, we should better fill with a value that does not exist in
the Age column and thereby make sure that we will not distort the data. I chose a huge number 333 for missing age values.

In [12]:
train['Age'].fillna(333, inplace=True)

Let's extract the letter and number from the Cabin column.

In [13]:
train['Cab_letter'] = train['Cabin'].str.extract(r'(\w).*')[0] 
train['Cab_letter'] = train['Cab_letter'].fillna('ZZZ') # Because there is no ZZZ cabin letter already existing.

In [14]:
train['Cab_nr'] = train['Cabin'].str.extract(r'(\w)(\d*$)')[1]
train['Cab_nr'] = pd.to_numeric(train['Cab_nr'], errors='coerce')

In [15]:
train['Cab_nr'].max()

148.0

Maximum Cab_nr is 148, we can fillna with a greater value like 2222

In [16]:
train['Cab_nr'].fillna(2222, inplace=True)

In [17]:
# Two missing Embarked values
emb_mode = train['Embarked'].mode()[0]
train['Embarked'].fillna(emb_mode, inplace=True)

To try different machine learning algorithms on our data, we can transform the categorical variables into numeric variables using an encoder.

In [18]:
for feature in train.columns:
    print(feature, train[feature].dtype)

PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
Title object
Cab_letter object
Cab_nr float64


Of course, it does not make sense to include the Name column. We already divided the Cabin column into two features; Cab_letter and Cab_nr, so we will not use the Cabin column either. We will also skip the Ticket column. Then, we are left with Sex, Embarked, Title, Cab_letter as 'object' datatypes and the others as numeric data. So, we will only encode these four categorical features.

In [19]:
features_to_encode = ['Cab_letter', 'Embarked', 'Sex', 'Title']
# Encoding selected features with Label Encoder
enc = LabelEncoder()
for col in features_to_encode:
    colname = col[:3] + '_lab'
    train[colname] = enc.fit_transform(train[col])

In [20]:
for feature in train.columns:
    print(feature, train[feature].dtype)

PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
Title object
Cab_letter object
Cab_nr float64
Cab_lab int32
Emb_lab int32
Sex_lab int32
Tit_lab int32


Now we can create a list of input features going into our model.

In [21]:
input_features = ['Cab_lab', 'Emb_lab', 'Sex_lab', 'Tit_lab', 'Cab_nr', 'Fare', 'SibSp', 'Parch', 'Pclass', 'Age']

We will do similar transformations in the test dataset.

In [22]:
test_titles = test['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [23]:
test_titles.isna().sum()

0

In [24]:
print(test_titles.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']


Each passenger in the test dataset has a title but some of the titles exist only in the test dataset and not in the training set and vice versa. Also, I have searched for 'Don' and 'Dona' titles on the internet and seen that they are the same thing just written different. 

In [25]:
test_titles[test_titles == 'Dona'] = 'Don'

In [26]:
test['Title'] = test_titles

In [27]:
test['Age'].fillna(333, inplace=True)

In [28]:
mean_fare = test['Fare'].mean()
test['Fare'].fillna(mean_fare, inplace=True)

In [29]:
test['Cab_letter'] = test['Cabin'].str.extract(r'(\w).*')[0] 

In [30]:
test['Cab_letter'].fillna('ZZZ', inplace=True)

In [31]:
test['Cab_nr'] = test['Cabin'].str.extract(r'(\w)(\d*$)')[1]

In [32]:
test['Cab_nr'] = pd.to_numeric(test['Cab_nr'], errors='coerce')

In [33]:
test['Cab_nr'].isna().sum()

329

In [34]:
test['Cab_nr'].fillna(2222, inplace=True)

Now, we can encode the categorical variables for the test set. Yet, we have to make sure that each category from each column gets the same numeric value in the training dataset.

In [35]:
train_title_encodings = train[['Title', 'Tit_lab']].drop_duplicates()

In [36]:
train_title_encodings

Unnamed: 0,Title,Tit_lab
0,Mr,11
1,Mrs,12
2,Miss,8
7,Master,7
30,Don,2
149,Rev,14
245,Dr,3
369,Mme,10
443,Ms,13
449,Major,6


In [37]:
train_embarked_encodings = train[['Embarked', 'Emb_lab']].drop_duplicates()

In [38]:
train_embarked_encodings

Unnamed: 0,Embarked,Emb_lab
0,S,2
1,C,0
5,Q,1


In [40]:
train_cabletter_encodings = train[['Cab_letter', 'Cab_lab']].drop_duplicates()

In [41]:
train_cabletter_encodings

Unnamed: 0,Cab_letter,Cab_lab
0,ZZZ,8
1,C,2
6,E,4
10,G,6
21,D,3
23,A,0
31,B,1
66,F,5
339,T,7


In [42]:
train_sex_encodings = train[['Sex', 'Sex_lab']].drop_duplicates()

In [43]:
train_sex_encodings

Unnamed: 0,Sex,Sex_lab
0,male,1
1,female,0


Now, we can apply the same encodings in the test set.

In [44]:
feature_dict = {'Sex': train_sex_encodings, 'Cab_letter':train_cabletter_encodings, 
                'Embarked':train_embarked_encodings, 'Title':train_title_encodings}

In [45]:
# Let's remember features_to_encode
features_to_encode

['Cab_letter', 'Embarked', 'Sex', 'Title']

In [46]:
# Encoding with the same numeric values
for feature in features_to_encode:
    enc_col_name = feature[:3] + '_lab'
    for enc_val in train[feature].unique():
        series = test[feature]
        index = series[series == enc_val].index
        encoding_df = feature_dict[feature]
        value = encoding_df[enc_col_name][encoding_df[feature] == enc_val].iloc[0]
        test.loc[index, enc_col_name] = value

In [47]:
# The encoded features in the test set happen to have datatypes that are different from the ones in the training set.
# This might affect our model.
for feature in test.columns:
    if train[feature].dtype != test[feature].dtype:
        print(f'{feature}, training_set: {train[feature].dtype}, test_set: {test[feature].dtype}')

Cab_lab, training_set: int32, test_set: float64
Emb_lab, training_set: int32, test_set: float64
Sex_lab, training_set: int32, test_set: float64
Tit_lab, training_set: int32, test_set: float64


In [48]:
# Converting encoded features in the test set to int32.
int_type = 'int32'
enc_features = ['Cab_lab', 'Emb_lab', 'Sex_lab', 'Tit_lab']
for feature in enc_features:
    test[feature] = test[feature].astype(int_type)

Now that everything seems fine we can apply our Random Forest model and determine our hyperparameters.

Let's create a grid of values that from which our model will choose randomly and try a broad range of values.

In [49]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint.pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [232]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose = 2, n_jobs = -1)
# Fit the random search model
rf_random.fit(train[input_features], train['Survived'])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 20.0min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=2)

In [234]:
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}

Since we found out a 'rough' set of best of hyper parameters, we can now narrow our range of values.

In [49]:
param_grid = {'bootstrap': [True],
              'max_depth': [60, 70, 80, 90, 100],
              'max_features': [2, 3],
              'min_samples_leaf': [2, 3],
              'min_samples_split': [3, 5, 7],
              'n_estimators': [300, 400, 500, 1000]
             }

In [265]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [266]:
# Fit the grid search to the data
grid_search.fit(train[input_features], train['Survived'])

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 27.7min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [60, 70, 80, 90, 100],
                         'max_features': [2, 3], 'min_samples_leaf': [2, 3],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [300, 400, 500, 1000]},
             verbose=2)

In [267]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 70,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 7,
 'n_estimators': 500}

In [268]:
grid_search.best_score_

0.8327663046889713

Now, we can simply use our hyper parameters to make the predictions for the test set. 

In [270]:
clf = RandomForestClassifier(**grid_search.best_params_)
clf.fit(train[input_features], train['Survived'])
preds = clf.predict(test[input_features])
csv_df = pd.DataFrame()
csv_df['PassengerId'] = test['PassengerId']
csv_df['Survived'] = preds
csv_df.to_csv('sub.csv', index=False)

We have created the csv file and the file is ready to submit.