In [1]:
# importing libraries
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


In [2]:
# Load the Titanic passenger records from the CSV file and preview the first rows.
titanic_csv = "titanic.csv"
data = pd.read_csv(titanic_csv)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Data imputing follows the same approach as in the previous tasks.
# For the sake of this exercise, rows with missing data are kept rather than deleted:
# missing 'Age' values are filled with the column mean and missing 'Embarked' values
# with the most frequent port.
# NOTE(review): the imputers are fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the imputed values.
from sklearn.impute import SimpleImputer

for column, strategy in (("Age", "mean"), ("Embarked", "most_frequent")):
    impute = SimpleImputer(missing_values=np.nan, strategy=strategy)
    # Double brackets keep the selection 2-D, as the imputer expects.
    data[[column]] = impute.fit_transform(data[[column]])

In [4]:
# Create a new column 'Sex_num' encoding 'male' as 0 and 'female' as 1, so that the
# models receive numerical values.
# An explicit mapping produces an integer column directly; chained .replace() calls on
# a string column rely on implicit object-to-int downcasting, which pandas 2.2 deprecates.
sex_codes = {"male": 0, "female": 1}
data["Sex_num"] = data["Sex"].map(sex_codes)
# .map() turns any label missing from the mapping into NaN, so fail loudly here instead
# of silently passing NaN on to the models.
assert data["Sex_num"].notna().all(), "unexpected value in 'Sex' column"

In [5]:
# Create a new column 'Embarked_num' encoding 'S' as 0, 'C' as 1 and 'Q' as 2, so that
# the models receive numerical values.
# An explicit mapping produces an integer column directly; chained .replace() calls on
# a string column rely on implicit object-to-int downcasting, which pandas 2.2 deprecates.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
data["Embarked_num"] = data["Embarked"].map(embarked_codes)
# .map() turns any label missing from the mapping into NaN, so fail loudly here instead
# of silently passing NaN on to the models.
assert data["Embarked_num"].notna().all(), "unexpected value in 'Embarked' column"

In [6]:
# Preview the first rows to confirm that the new 'Sex_num' and 'Embarked_num' columns
# were added:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_num,Embarked_num
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0


In [7]:
# Feature matrix x: Pclass, Age, SibSp, Parch, Fare, Sex (numeric) and Embarked (numeric).
# Embarked is included because the previous task (T33) showed the highest survival for
# people embarking in Cherbourg, followed by Queenstown and then Southampton.
# Columns are selected by name rather than by position, so the selection is readable
# and does not silently change if the column order of the frame changes. The order
# below matches the original positional selection, so feature indices are unchanged.
feature_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex_num", "Embarked_num"]
x = data[feature_columns].values
# Target y: the Survived outcome.
y = data["Survived"].values
# x is already 2-D with one column per selected feature, so no reshape is needed for sklearn.

In [8]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=7,
)

In [9]:
# Sanity-check the split by printing the shapes of the feature and target arrays.
for label, features, target in (
    ("Training data:", x_train, y_train),
    ("Test data:", x_test, y_test),
):
    print(label, features.shape, target.shape)

Training data: (623, 7) (623,)
Test data: (268, 7) (268,)


In [10]:
# Build the base decision tree (max_depth = 5, optimised in the previous task) and the
# three ensemble models: Bagged, Random Forest and Boosted.
# The base tree is passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, while the
# first positional argument works on both sides of the rename.
base = DecisionTreeClassifier(max_depth=5, random_state=7)
ensemble_bagging = BaggingClassifier(base, n_estimators=100, random_state=7)
ensemble_forest = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=7)
ensemble_boost = AdaBoostClassifier(base, n_estimators=100, random_state=7)

# Fit every model on the same training split.
for model in (base, ensemble_bagging, ensemble_forest, ensemble_boost):
    model.fit(x_train, y_train)

# Report the accuracy of each model on the held-out test split.
print("Accuracy of base:", round(base.score(x_test, y_test), 3))
print("Accuracy of ensemble bagged:", round(ensemble_bagging.score(x_test, y_test), 3))
print("Accuracy of ensemble forest:", round(ensemble_forest.score(x_test, y_test), 3))
print("Accuracy of ensemble boosted:", round(ensemble_boost.score(x_test, y_test), 3))

Accuracy of base: 0.772
Accuracy of ensemble bagged: 0.772
Accuracy of ensemble forest: 0.787
Accuracy of ensemble boosted: 0.754


In [11]:
# Rank the features (independent variables) by how much the fitted random forest relied
# on them, using its feature_importances_ attribute. The index labels of the resulting
# Series are the column positions of the features in x.
forest_importances = ensemble_forest.feature_importances_
important_features = pd.Series(forest_importances).sort_values(ascending=False)
important_features

5    0.458004
4    0.172670
0    0.127043
1    0.116282
3    0.047407
2    0.045241
6    0.033354
dtype: float64

From the Random Forest model it looks like feature 5 (Sex) is the one that contributes the most to the model's survival predictions, followed by feature 4 (Fare).

Of all models, looking at accuracy scores, Random Forest model performed the best at max_depth = 5. 
I will now assess the effect of other max_depths (from 2 to 10) on all three models: 

In [12]:
# Refit all four models for every tree depth from 2 to 10 and print their test accuracy.
# The loop variable is named max_depth directly: looping over `x` would overwrite the
# feature matrix `x` defined earlier and break any re-run of the train/test split cell.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# NOTE(review): depths are compared on the test split, so the best score found here is
# an optimistic estimate; a validation split or cross-validation would avoid that.
for max_depth in range(2, 11):
    base = DecisionTreeClassifier(max_depth=max_depth, random_state=7).fit(x_train, y_train)
    ensemble_bagging = BaggingClassifier(base, n_estimators=100, random_state=7).fit(x_train, y_train)
    ensemble_forest = RandomForestClassifier(max_depth=max_depth, n_estimators=100, random_state=7).fit(x_train, y_train)
    ensemble_boost = AdaBoostClassifier(base, n_estimators=100, random_state=7).fit(x_train, y_train)
    print("Accuracies for models with max_depth =", max_depth)
    print("Base:\t", round(base.score(x_test, y_test), 3))
    print("Bagged:\t", round(ensemble_bagging.score(x_test, y_test), 3))
    print("Forest:\t", round(ensemble_forest.score(x_test, y_test), 3))
    print("Boost:\t", round(ensemble_boost.score(x_test, y_test), 3))

Accuracies for models with max_depth = 2
Base:	 0.743
Bagged:	 0.746
Forest:	 0.776
Boost:	 0.735
Accuracies for models with max_depth = 3
Base:	 0.765
Bagged:	 0.765
Forest:	 0.784
Boost:	 0.728
Accuracies for models with max_depth = 4
Base:	 0.765
Bagged:	 0.769
Forest:	 0.787
Boost:	 0.776
Accuracies for models with max_depth = 5
Base:	 0.772
Bagged:	 0.772
Forest:	 0.787
Boost:	 0.754
Accuracies for models with max_depth = 6
Base:	 0.761
Bagged:	 0.784
Forest:	 0.78
Boost:	 0.757
Accuracies for models with max_depth = 7
Base:	 0.769
Bagged:	 0.787
Forest:	 0.791
Boost:	 0.731
Accuracies for models with max_depth = 8
Base:	 0.757
Bagged:	 0.784
Forest:	 0.787
Boost:	 0.769
Accuracies for models with max_depth = 9
Base:	 0.743
Bagged:	 0.784
Forest:	 0.795
Boost:	 0.757
Accuracies for models with max_depth = 10
Base:	 0.754
Bagged:	 0.776
Forest:	 0.795
Boost:	 0.739


For the max_depths tested (2-10), with constant n_estimators (set to 100) and constant random_state (set to 7), the highest accuracy for each model was obtained at the following max_depths:
- Base model = 5
- Bagged model = 7
- Forest model = 9, 10
- Boosted model = 4
It looks like changing max_depth affects each model's accuracy slightly differently. 
The highest accuracy across the three ensemble models was 0.795, achieved by the Forest model (at max_depth 9 and 10).
Because of highest accuracy score in the max_depth analysis, I will use the Random Forest model for further analysis on the n_estimators.

In [13]:
# Assess the effect of n_estimators on the Random Forest at max_depth = 9, trying
# 1, 31, 61, ... 241 trees.
# The loop variable is named estimations directly: looping over `x` would overwrite the
# feature matrix `x` defined earlier and break any re-run of the train/test split cell.
for estimations in range(1, 250, 30):
    ensemble_forest = RandomForestClassifier(max_depth=9, n_estimators=estimations, random_state=7).fit(x_train, y_train)
    print(f"Accuracy of Random Forest at {estimations} estimations:\t", round(ensemble_forest.score(x_test, y_test), 3))

Accuracy of Random Forest at 1 estimations:	 0.746
Accuracy of Random Forest at 31 estimations:	 0.802
Accuracy of Random Forest at 61 estimations:	 0.791
Accuracy of Random Forest at 91 estimations:	 0.787
Accuracy of Random Forest at 121 estimations:	 0.791
Accuracy of Random Forest at 151 estimations:	 0.795
Accuracy of Random Forest at 181 estimations:	 0.791
Accuracy of Random Forest at 211 estimations:	 0.799
Accuracy of Random Forest at 241 estimations:	 0.795


The highest accuracy of Random Forest model at max_depth=9, was at 31 n_estimators. 

In [14]:
# Refit the Random Forest with the selected settings (max_depth = 9, n_estimators = 31)
# and report its accuracy on the test split.
ensemble_forest = RandomForestClassifier(
    max_depth=9,
    n_estimators=31,
    random_state=7,
).fit(x_train, y_train)
forest_accuracy = round(ensemble_forest.score(x_test, y_test), 3)
print("Accuracy of ensemble forest:", forest_accuracy)

Accuracy of ensemble forest: 0.802


Accuracies for all the models at max_depth = 5 are calculated in cell [10]. In the case of the Forest model, the accuracy score increased (after optimising the parameter settings) from 0.787 to 0.802. In conclusion, the best model for predicting the data I selected as x and y in this task is the Random Forest model with max_depth = 9 and n_estimators = 31 (accuracy = 0.802).