In [123]:
import pandas as pd
import matplotlib.pyplot as plt

In [124]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [125]:
train
# 891 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [126]:
test;
# 418 rows

In [127]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [128]:
# We concat the train and test set because we are preprocessing the data together.
data = pd.concat([train, test])
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [129]:
data.describe().loc[['count']]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0


So, we have to fill in age, and that one missing fare.

Note that the one missing fare is in the test. So we have to come up with a method to generalise fare.

https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.core.groupby.DataFrameGroupBy.agg.html

SQL Like: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html


# Feature Engineering

In [130]:
# Drop useless features
data.drop(['PassengerId','Cabin','Ticket','Fare', 'Parch', 'SibSp'], axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Embarked
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,S
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,C
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,S
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,S
4,0.0,3,"Allen, Mr. William Henry",male,35.0,S
...,...,...,...,...,...,...
413,,3,"Spector, Mr. Woolf",male,,S
414,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,C
415,,3,"Saether, Mr. Simon Sivertsen",male,38.5,S
416,,3,"Ware, Mr. Frederick",male,,S


## Objective
Make all categorical features (like Sex and Embarked) to contain numerical data instead.

For Sex: Use LabelEncoder from sklearn
For Embarked: Use get_dummies from pandas

In [131]:
from sklearn.preprocessing import LabelEncoder

## Sex

In [132]:
data.Sex.unique()

array(['male', 'female'], dtype=object)

In [133]:
data.Sex = LabelEncoder().fit_transform(data.Sex)

In [134]:
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Embarked
0,0.0,3,"Braund, Mr. Owen Harris",1,22.0,S
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,C
2,1.0,3,"Heikkinen, Miss. Laina",0,26.0,S
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,S
4,0.0,3,"Allen, Mr. William Henry",1,35.0,S
...,...,...,...,...,...,...
413,,3,"Spector, Mr. Woolf",1,,S
414,,1,"Oliva y Ocana, Dona. Fermina",0,39.0,C
415,,3,"Saether, Mr. Simon Sivertsen",1,38.5,S
416,,3,"Ware, Mr. Frederick",1,,S


# Embarked

In [135]:
data.Embarked.unique()
# We will determine that there are 'NaN' values in the Embark column.

array(['S', 'C', 'Q', nan], dtype=object)

In [136]:
# Let's count the number of null values.
# We can use builtin pandas function dedicated for NaN values
sum(data.Embarked.isnull())

2

In [137]:
# Let's inspect them
data[data['Embarked'].isnull()]
# Note that we cannot use == 'NaN', because equivalent operation does not work for NaN values.

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Embarked
61,1.0,1,"Icard, Miss. Amelie",0,38.0,
829,1.0,1,"Stone, Mrs. George Nelson (Martha Evelyn)",0,62.0,


In [138]:
# We have to give some values to these NaN values before we can proceed with preprocessing.
# For absolutely no reason, we will assign them 'S'.
bool_array = data.Embarked.isnull()
data.loc[bool_array, 'Embarked'] = 'S'

# double check that there are no more NaN values under Embarked.
sum(data.Embarked.isnull())

0

In [139]:
# We can proceed with preprocessing with use of panda's get_dummies
dummy_embark = pd.get_dummies(data.Embarked, 
                              prefix='Embarked', 
                              drop_first=True)
data = pd.concat([data, dummy_embark], axis=1) 
data.drop(['Embarked'], axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Embarked_Q,Embarked_S
0,0.0,3,"Braund, Mr. Owen Harris",1,22.0,0,1
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,0,0
2,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,1
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,0,1
4,0.0,3,"Allen, Mr. William Henry",1,35.0,0,1
...,...,...,...,...,...,...,...
413,,3,"Spector, Mr. Woolf",1,,0,1
414,,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0
415,,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,1
416,,3,"Ware, Mr. Frederick",1,,0,1


## Next Objective

We have quite a lot of NaN values under the Age column. We cannot fill those manually the way we did for 'Embarked'. We have to employ an approach that utilises the passenger's 'Name' as an inference to approximate his/her age.


In [140]:
sum(data.Age.isnull())

263

In [141]:
s = "Spector, Mr. Woolf"

In [142]:
s.split(', ')[1].split('.')[0]

'Mr'

In [143]:
data['Name'] = data['Name'].map(lambda val : val.split(', ')[1].split('.')[0] if ( pd.notnull(val) ) else val )

In [144]:
data.Name.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [145]:
data.groupby('Name').agg('count')[['Pclass']]

Unnamed: 0_level_0,Pclass
Name,Unnamed: 1_level_1
Capt,1
Col,4
Don,1
Dona,1
Dr,8
Jonkheer,1
Lady,1
Major,2
Master,61
Miss,260


Our objective is to take the average of the passengers' ages, categorised by a combination of their **gender, title, and passenger-class**.

With regards to titles however, there are a number of titles which are only held by a handful of passengers. Like 'Sir' and 'the Countess'. We need to make titles less granular (binning them into broader categories), so that the population of each category contains more passengers.

The categories are as follows:
- Officer
- Royalty 
- Mr
- Mrs
- Miss
- Master

In [146]:
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [147]:
data['Title'] = data['Name'].map(lambda val : Title_Dictionary[val])
data.drop(['Name'], axis=1, inplace=True)
data

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked_Q,Embarked_S,Title
0,0.0,3,1,22.0,0,1,Mr
1,1.0,1,0,38.0,0,0,Mrs
2,1.0,3,0,26.0,0,1,Miss
3,1.0,1,0,35.0,0,1,Mrs
4,0.0,3,1,35.0,0,1,Mr
...,...,...,...,...,...,...,...
413,,3,1,,0,1,Mr
414,,1,0,39.0,0,0,Royalty
415,,3,1,38.5,0,1,Mr
416,,3,1,,0,1,Mr


To avoid data leakage from train set to test set, the computation of averages (of each Title) must be done within the former (the non-NaN values within the train set), after which we will then copy over to the combined dataset whose Ages are NaN.

In [148]:
train = data[ data['Survived'].notnull() ]
train

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked_Q,Embarked_S,Title
0,0.0,3,1,22.0,0,1,Mr
1,1.0,1,0,38.0,0,0,Mrs
2,1.0,3,0,26.0,0,1,Miss
3,1.0,1,0,35.0,0,1,Mrs
4,0.0,3,1,35.0,0,1,Mr
...,...,...,...,...,...,...,...
886,0.0,2,1,27.0,0,1,Officer
887,1.0,1,0,19.0,0,1,Miss
888,0.0,3,0,,0,1,Miss
889,1.0,1,1,26.0,0,0,Mr


In [149]:
median_age = train.groupby(['Sex','Pclass','Title']).median()
median_age

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived,Age,Embarked_Q,Embarked_S
Sex,Pclass,Title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,Miss,1.0,30.0,0.0,0.5
0,1,Mrs,1.0,40.0,0.0,1.0
0,1,Officer,1.0,49.0,0.0,1.0
0,1,Royalty,1.0,40.5,0.0,0.5
0,2,Miss,1.0,24.0,0.0,1.0
0,2,Mrs,1.0,31.5,0.0,1.0
0,3,Miss,0.5,18.0,0.0,1.0
0,3,Mrs,0.5,31.0,0.0,1.0
1,1,Master,1.0,4.0,0.0,1.0
1,1,Mr,0.0,40.0,0.0,1.0


In [150]:
median_age = median_age.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
median_age

Unnamed: 0,Sex,Pclass,Title,Age
0,0,1,Miss,30.0
1,0,1,Mrs,40.0
2,0,1,Officer,49.0
3,0,1,Royalty,40.5
4,0,2,Miss,24.0
5,0,2,Mrs,31.5
6,0,3,Miss,18.0
7,0,3,Mrs,31.0
8,1,1,Master,4.0
9,1,1,Mr,40.0


In [151]:
def age_from_train(row):
    # Given a row (passenger entry), return a guess of the passenger's age based on his/her sex, pclass, and title, according
    #  to our dataframe.
    
    # Generate boolean arrays
    sex_bool_a = median_age['Sex'] == row['Sex']
    p_bool_a = median_age['Pclass'] == row['Pclass']
    t_bool_a = median_age['Title'] == row['Title']
    
    # Perform bitwise AND operation across the 3 boolean arrays
    bool_arr = sex_bool_a & p_bool_a & t_bool_a 
    return median_age[bool_arr]['Age'].values[0]

We can now apply our function on the combined data, to populate the 'Age' fields where there are none.

In [152]:
# We cannot use 'map' because that is for working within a column (pandas series)
# So should use 'apply' instead, that works for entire dataframes. We also have to specify the axis=1, to indicate we are
#  working down the column-axis, i.e. row-to-row.
data['Age'] = data.apply(lambda row : 
                         age_from_train(row) if pd.isnull(row['Age']) 
                         else row['Age'],
                         axis=1 
                        )

In [153]:
sum(data.Age.isnull())

0

In [154]:
data

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked_Q,Embarked_S,Title
0,0.0,3,1,22.0,0,1,Mr
1,1.0,1,0,38.0,0,0,Mrs
2,1.0,3,0,26.0,0,1,Miss
3,1.0,1,0,35.0,0,1,Mrs
4,0.0,3,1,35.0,0,1,Mr
...,...,...,...,...,...,...,...
413,,3,1,26.0,0,1,Mr
414,,1,0,39.0,0,0,Royalty
415,,3,1,38.5,0,1,Mr
416,,3,1,26.0,0,1,Mr


# Modelling

Now to split our dataset into train and test. And prepare to train.

In [155]:
combined = data.drop(['Title'], axis=1)

In [156]:
train = combined[ data.Survived.notnull() ]

In [157]:
y = train['Survived']
y

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [158]:
train.drop(['Survived'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [160]:
train

Unnamed: 0,Pclass,Sex,Age,Embarked_Q,Embarked_S
0,3,1,22.0,0,1
1,1,0,38.0,0,0
2,3,0,26.0,0,1
3,1,0,35.0,0,1
4,3,1,35.0,0,1
...,...,...,...,...,...
886,2,1,27.0,0,1
887,1,0,19.0,0,1
888,3,0,18.0,0,1
889,1,1,26.0,0,0


In [208]:
test = combined[ data.Survived.isnull() ]
test.drop(['Survived'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [191]:
from sklearn.model_selection import cross_val_score
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

logreg_clf = LogisticRegression(penalty='l2', random_state=0)
knn_clf = KNeighborsClassifier(n_neighbors = 9, metric = 'minkowski', p = 2)
svm_clf = SVC(kernel = 'rbf', random_state = 0)
naivebayes_clf = GaussianNB()
rf_clf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
gb_clf = GradientBoostingClassifier() # no tuning

classifiers = [logreg_clf, knn_clf, svm_clf, naivebayes_clf, rf_clf, gb_clf]

In [192]:
for clf in classifiers:
    print("Cross-validation of ", clf.__class__)
    xval = cross_val_score(estimator=clf, 
                           X=train, 
                           y=y, 
                           cv = 10, #cv = 5
                           scoring='accuracy')
    print(np.mean(xval), " +/- ", np.std(xval))
    print('*' * 25)

Cross-validation of  <class 'sklearn.linear_model._logistic.LogisticRegression'>
0.7935205992509362  +/-  0.020483878476698046
*************************
Cross-validation of  <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
0.7744319600499375  +/-  0.04143994623436661
*************************
Cross-validation of  <class 'sklearn.svm._classes.SVC'>
0.6341448189762797  +/-  0.025786319659583442
*************************
Cross-validation of  <class 'sklearn.naive_bayes.GaussianNB'>
0.7789138576779026  +/-  0.02057816421564904
*************************
Cross-validation of  <class 'sklearn.ensemble._forest.RandomForestClassifier'>
0.812621722846442  +/-  0.02709294539370844
*************************
Cross-validation of  <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
0.8260424469413234  +/-  0.02667224197913903
*************************


# Hyperparameters Tuning

First, we focus on Random Forest Classifier. 

Then we revert to an untuned rf classifier. 

Then we do grid of parameters. And perform gridsearch on all permutations of parameters.

In [194]:
untuned_clf = RandomForestClassifier()
print("Cross-validation of ", untuned_clf.__class__)
xval = cross_val_score(estimator=untuned_clf, 
                       X=train, 
                       y=y, 
                       cv = 10, #cv = 5
                       scoring='accuracy')
print(np.mean(xval), " +/- ", np.std(xval))
print('*' * 25)

Cross-validation of  <class 'sklearn.ensemble._forest.RandomForestClassifier'>
0.8103995006242197  +/-  0.03253405662222191
*************************


In [199]:
from sklearn.model_selection import StratifiedKFold # I have no idea what this is! :/
from sklearn.model_selection import GridSearchCV

In [201]:
# Running gridsearch (all permutations of your para_grid) takes up a lot of time.
run_gridsearch = False

if run_gridsearch:
    parameter_grid = {'max_depth' : [4, 6, 8],
                      'n_estimators': [50, 10],
                      'max_features': ['sqrt', 'auto', 'log2'],
                      'min_samples_split': [2, 3, 10],
                      'min_samples_leaf': [1, 3, 10],
                      'bootstrap': [True, False]}
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(n_splits=5)

    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               verbose=1
                              )

    grid_search.fit(train, y)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best score: 0.824913690289373
Best parameters: {'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50}


[Parallel(n_jobs=1)]: Done 1620 out of 1620 | elapsed:  1.2min finished


In [202]:
# After running the computationally heavy task, we got the following parameter configuration:
best_parameters: {'bootstrap': True, 
                  'max_depth': 4, 
                  'max_features': 'sqrt', 
                  'min_samples_leaf': 1, 
                  'min_samples_split': 3, 
                  'n_estimators': 50}

Just to be clear, let's recap:

We have determined the "best" configuration of parameters to be fed into our Random Forest Classifier.

We now need to train this specific model with our training data, and fit it into our y.


In [205]:
model = RandomForestClassifier(**parameters)
model.fit(train, y);

In [215]:
output = model.predict(test).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('input/test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output

# We ignore the other columns, and only display 2 columns: 'PassengerId' and 'Survived'.
df_output[['PassengerId','Survived']].to_csv('output/gridsearch_rf.csv', index=False)