In [1]:
# Import pandas and load titanic dataset
import pandas

In [2]:
raw_data = pandas.read_csv('./titanic.csv', index_col='PassengerId')

# Step 1: Study Dataset

### Review features of dataset
Through this cell we will review the total # of rows in the dataset to better understand its size. Then we review the features (columns) of the dataset and then iterate through each feature and print its first 5 rows as well as get the columns type.

In [3]:
length = len(raw_data)
columns = raw_data.columns

print("Num of rows:: " + str(length))
print(raw_data.columns)

for row in columns:
  print(raw_data[row].head())

Num of rows:: 891
Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64
PassengerId
1    3
2    1
3    3
4    1
5    3
Name: Pclass, dtype: int64
PassengerId
1                              Braund, Mr. Owen Harris
2    Cumings, Mrs. John Bradley (Florence Briggs Th...
3                               Heikkinen, Miss. Laina
4         Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                             Allen, Mr. William Henry
Name: Name, dtype: object
PassengerId
1      male
2    female
3    female
4    female
5      male
Name: Sex, dtype: object
PassengerId
1    22.0
2    38.0
3    26.0
4    35.0
5    35.0
Name: Age, dtype: float64
PassengerId
1    1
2    1
3    0
4    1
5    0
Name: SibSp, dtype: int64
PassengerId
1    0
2    0
3    0
4    0
5    0
Name: Parch, dtype: int64
PassengerId
1           A/5 21171
2            PC 

# Step 2: Data Cleansing

### Handling missing data
Though the next few cells we are going to clean up our data with our first step being missing values.

In [4]:
# List out the null or na values in each column
raw_data.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

You can see that `Cabin` has 687 empty values compared to the 891 total potential values. It can be safe to say that the `Cabin` feature is not helpful and can be dropped. We will also save this change to a new dataframe. 

In [5]:
clean_data = raw_data.drop('Cabin', axis=1) # We are setting axis to "1" to represent dropping the column whereas "0" would mean "row"

The next column to review is `Age` which has 177 missing values which is not very significant and the age feature is one that would be important for us to use so we would not want to drop it. In situations like this a common approach would be to take the median of every value in that column and apply that to the missing values.

In [6]:
median_age = clean_data["Age"].median()
clean_data["Age"] = clean_data["Age"].fillna(median_age)

The third column missing data is `Embarked` which has only 2 missing values. The column is made up of strings not integers so an approach would be to group these missing values as one value "U" for unknown.

In [7]:
clean_data["Embarked"] = clean_data["Embarked"].fillna('U')

In [8]:
# Save updated dataframe to a new CSV to not lose progress
clean_data.to_csv('./clean_titanic_data.csv', index=None)

# Step 3: Feature Engineering

### 3.1: Turn categorical data into numerical data with One-hot encoding
In order for our model to perform mathematical operations with our data we have to make sure all of the data is numerical.

**One-hot encoding:** Look at a specific feature and then determine the number of classes within that feature. You will then create a new column (feature) specific to that one class within that feature. In the case of `Sex` you have "male" and "female". So you would add 2 new columns to the dataframe-- `gender_male && gender_female`
- Keep in mind that if you have a feature with hundreds of classes then one-hot encoding will create a hundred new columns, and those columns will be filled with 0's and 1's.

In [9]:
preprocessed_data = pandas.read_csv('./clean_titanic_data.csv')

gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')
embarked_columns = pandas.get_dummies(
    preprocessed_data['Embarked'], prefix='Embarked')

preprocessed_data = pandas.concat([preprocessed_data, gender_columns], axis=1)
preprocessed_data = pandas.concat([preprocessed_data, embarked_columns], axis=1)

preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)

In [10]:
preprocessed_data_columns = preprocessed_data.columns
print(preprocessed_data_columns)

Index(['Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_U'],
      dtype='object')


### one-shot encoding for numerical classes
Looking at the feature `Pclass` we see that there are 3 numbers representing it. 3, 2, 1 representing first class, second class, and third class. In order to determine if we would want to apply one-shot encoding we should see how this feature plays a role in our predicted outcome. We will view this through the following code

In [11]:
class_survived = preprocessed_data[['Pclass', 'Survived']]

first_class = class_survived[class_survived['Pclass'] == 1]
second_class = class_survived[class_survived['Pclass'] == 2]
third_class = class_survived[class_survived['Pclass'] == 3]

print("In first class", sum(first_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In second class", sum(second_class['Survived'])/len(second_class)*100, "% of passengers survived")
print("In third class", sum(third_class['Survived'])/len(third_class)*100, "% of passengers survived")

In first class 62.96296296296296 % of passengers survived
In second class 47.28260869565217 % of passengers survived
In third class 24.236252545824847 % of passengers survived


We can see that passengers in third class had the lowest chance of survival so in this case it would make sense to leave this feature as a linear feature where the higher the number the less of a chance of survival. For the projects sake we will still apply one-shot encoding and can apply changes in the future to see how the model reacts.

In [12]:
categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')
preprocessed_data = pandas.concat([preprocessed_data, categorized_pclass_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Pclass'], axis=1)

preprocessed_data_columns = preprocessed_data.columns
print(preprocessed_data_columns)

Index(['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_U', 'Pclass_1', 'Pclass_2', 'Pclass_3'],
      dtype='object')


### Binning: Turning numerical data into categorical data
An example of when we would want to turn numerical data into categorical data would be through `Binning` which is the process of splitting numbers into several different buckets

Age is a good example, in the context of this titanic model we would want to answer the question: how does age effect the passengers chance of survival? In a linear model this could be grouped into two categories-- the older you are the the less likely you are to survive and vice versa. But what if this relationship is not as straightforward? 

The process of binning is similar to one-shot encoding where it will create new columns for each bin.

In [13]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
categorized_age = pandas.cut(preprocessed_data['Age'], bins)
print(categorized_age)
preprocessed_data['Categorized_age'] = categorized_age
preprocessed_data = preprocessed_data.drop(['Age'], axis=1)

# You will see that this will update `Categorized_age` values into a range of numbers
preprocessed_data_columns = preprocessed_data.columns
print(preprocessed_data_columns)

0      (20, 30]
1      (30, 40]
2      (20, 30]
3      (30, 40]
4      (30, 40]
         ...   
886    (20, 30]
887    (10, 20]
888    (20, 30]
889    (20, 30]
890    (30, 40]
Name: Age, Length: 891, dtype: category
Categories (8, interval[int64, right]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 70] < (70, 80]]
Index(['Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Embarked_U',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Categorized_age'],
      dtype='object')


In [14]:
categorized_aged_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pandas.concat([preprocessed_data, categorized_aged_columns], axis= 1)
preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)

In [15]:
preprocessed_data.head()

Unnamed: 0,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,False,False,...,False,True,False,False,True,False,False,False,False,False
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
2,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,False,False,...,False,True,False,False,True,False,False,False,False,False
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,0,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,False,False,...,False,True,False,False,False,True,False,False,False,False


## Feature Selection
Determine which columns are unecessary for our model. One of which would be "Name" which is unique for each passenger and would provide the model no insight into how the name relates to the passengers chance of survival, or in general that name doesn't relate for the matter of this dataset. We will drop `Name` `Ticket` and `PassengerId` 

In [16]:
preprocessed_data = preprocessed_data.drop(['Name', 'Ticket'], axis=1)

In [17]:
preprocessed_data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,1,1,0,71.2833,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,7.925,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,1,1,0,53.1,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,0,0,8.05,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False


In [18]:
preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=None)

# Model Training

In [19]:
data = pandas.read_csv('preprocessed_titanic_data.csv')

### Splitting data

In [20]:
# Separate the features from the labels
features = data.drop(['Survived'], axis=1)
labels = data['Survived']

In [21]:
# Perform 60/20/20 split 60% Training 20% validation 20% Test
from sklearn.model_selection import train_test_split

# Split training set and validation set, with test_size set to 0.4 we will set our training set to 60 and our test set to 40%
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(features, labels, test_size=0.4, random_state=100)

# Since we want a validation and test set we will take the validation set created above and split it in half to make it 20% val 20% test
features_validation, features_test, labels_validation, labels_test = train_test_split(features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

print(len(features_train))
print(len(features_validation))
print(len(features_test))
print(len(labels_train))
print(len(labels_validation))
print(len(labels_test))

534
178
179
534
178
179


### Models to train
We will review a Logistic Regression model as well as a decision tree, a naive Bayes model, a support vector machine, a random forest, a gradient boosted tree, and an AdaBoost model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier


In [23]:
lr_model = LogisticRegression()
lr_model.fit(features_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
dt_model = DecisionTreeClassifier()
dt_model.fit(features_train, labels_train)

nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)

svm_model = SVC()
svm_model.fit(features_train, labels_train)

rf_model = RandomForestClassifier()
rf_model.fit(features_train, labels_train)

gb_model = GradientBoostingClassifier()
gb_model.fit(features_train, labels_train)

ab_model = AdaBoostClassifier()
ab_model.fit(features_train, labels_train)



### Evaluate models accuracy
We will evaluate the models:

- accuracy: Ration between correctly labeled points and total number
- recall: TP / (TP + FN) True posititives divided by sum of true positives and false negatives
- precision: TP / (TP + FP) True positives divided by sum of true positiives and false positives
- F1 score: Mean of precision and recall

In [25]:
# Accuracy

print("Scores of the models")
print("Logistic regression:", lr_model.score(
    features_validation, labels_validation))
print("Decision tree:", dt_model.score(features_validation, labels_validation))
print("Naive Bayes:", nb_model.score(features_validation, labels_validation))
print("SVM:", svm_model.score(features_validation, labels_validation))
print("Random forest:", rf_model.score(features_validation, labels_validation))
print("Gradient boosting:", gb_model.score(
    features_validation, labels_validation))
print("AdaBoost:", ab_model.score(features_validation, labels_validation))

Scores of the models
Logistic regression: 0.7696629213483146
Decision tree: 0.7696629213483146
Naive Bayes: 0.7471910112359551
SVM: 0.6797752808988764
Random forest: 0.7752808988764045
Gradient boosting: 0.8089887640449438
AdaBoost: 0.7640449438202247


In [26]:
# F1 score

from sklearn.metrics import f1_score

print("F1-scores of the models:")

lr_predicted_labels = lr_model.predict(features_validation)
print("Logistic regression:", f1_score(labels_validation, lr_predicted_labels))

dt_predicted_labels = dt_model.predict(features_validation)
print("Decision Tree:", f1_score(labels_validation, dt_predicted_labels))

nb_predicted_labels = nb_model.predict(features_validation)
print("Naive Bayes:", f1_score(labels_validation, nb_predicted_labels))

svm_predicted_labels = svm_model.predict(features_validation)
print("Support Vector Machine:", f1_score(
    labels_validation, svm_predicted_labels))

rf_predicted_labels = rf_model.predict(features_validation)
print("Random Forest:", f1_score(labels_validation, rf_predicted_labels))

gb_predicted_labels = gb_model.predict(features_validation)
print("Gradient boosting:", f1_score(labels_validation, gb_predicted_labels))

ab_predicted_labels = ab_model.predict(features_validation)
print("AdaBoost:", f1_score(labels_validation, ab_predicted_labels))

F1-scores of the models:
Logistic regression: 0.6870229007633588
Decision Tree: 0.6917293233082706
Naive Bayes: 0.6808510638297872
Support Vector Machine: 0.4
Random Forest: 0.7014925373134329
Gradient boosting: 0.7384615384615385
AdaBoost: 0.6865671641791045


### Testing the model
We will test gradient boosting model since it scored the highest in F1

In [27]:
gb_model.score(features_test, labels_test)

0.8324022346368715

In [28]:
gb_predicted_test_labels = gb_model.predict(features_test)
f1_score(labels_test, gb_predicted_test_labels)

0.8026315789473685

## Hyperparameter Tuning

In this section we will take our lowest performing model and tune the hyperparameters using **Grid search** to improve its performance.
- Grid Search: training our model several times over different combinations of hyperparameters and selecting the combination that performs the best

For our SVM we will look to the kernel trick by applying the RBF (radial basis function) which will generate an infinite dimentional plane based on the hyperparameter `gamma` 

In [29]:
print("SVM grid search with a radial basis function kernel")

# rbf, C=1, gamma=0.1
svm_1_01 = SVC(kernel='rbf', C=1, gamma=0.1)
svm_1_01.fit(features_train, labels_train)
print("C=1, gamma=0.1", svm_1_01.score(features_validation, labels_validation))

# rbf, C=1, gamma=1
svm_1_1 = SVC(kernel='rbf', C=1, gamma=1)
svm_1_1.fit(features_train, labels_train)
print("C=1, gamma=1", svm_1_1.score(features_validation, labels_validation))

# rbf, C=1, gamma=10
svm_1_10 = SVC(kernel='rbf', C=1, gamma=10)
svm_1_10.fit(features_train, labels_train)
print("C=1, gamma=10", svm_1_10.score(features_validation, labels_validation))

# rbf, C=10, gamma=0.1
svm_10_01 = SVC(kernel='rbf', C=10, gamma=0.1)
svm_10_01.fit(features_train, labels_train)
print("C=10, gamma=0.1", svm_10_01.score(
    features_validation, labels_validation))

# rbf, C=10, gamma=1
svm_10_1 = SVC(kernel='rbf', C=10, gamma=1)
svm_10_1.fit(features_train, labels_train)
print("C=10, gamma=1", svm_10_1.score(features_validation, labels_validation))

# rbf, C=10, gamma=10
svm_10_10 = SVC(kernel='rbf', C=10, gamma=10)
svm_10_10.fit(features_train, labels_train)
print("C=10, gamma=10", svm_10_10.score(
    features_validation, labels_validation))

SVM grid search with a radial basis function kernel
C=1, gamma=0.1 0.702247191011236
C=1, gamma=1 0.6966292134831461
C=1, gamma=10 0.6685393258426966
C=10, gamma=0.1 0.7247191011235955
C=10, gamma=1 0.6910112359550562
C=10, gamma=10 0.651685393258427


Instead of creating an individual model for every iteration of hyperparameter testing we will utilize Scikit-learn and its GridSearchCV object.

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
svm_parameters = {'kernel': ['rbf'],
                  'C': [0.01, 0.1, 1, 10, 100],
                  'gamma': [0.01, 0.1, 1, 10, 100]
                  }
svm = SVC()
svm_gs = GridSearchCV(estimator=svm,
                      param_grid=svm_parameters)
svm_gs.fit(features_train, labels_train)

svm_winner = svm_gs.best_estimator_
svm_winner

svm_winner.score(features_validation, labels_validation)

0.7191011235955056

In [32]:
svm_winner

### K-fold cross-validation
K-fold is useful in many situations including situations with a small dataset like this example.

It works by recycling the data as follows:
1. Split the data in to k equal portions
2. Train the model k times, using the union of k-1 of the portion sas the training set and the remaining one as a validation set
3. The final score of that model is teh average of the validation scores from the k steps.


In [33]:
svm_gs.cv_results_

{'mean_fit_time': array([0.00647345, 0.00650959, 0.00674238, 0.00706315, 0.00686255,
        0.00620084, 0.00643125, 0.00676203, 0.0072093 , 0.00701556,
        0.00613918, 0.00668497, 0.00695443, 0.00736432, 0.00729232,
        0.00666924, 0.00728741, 0.00823917, 0.0083632 , 0.00762291,
        0.00812764, 0.01003103, 0.00803967, 0.00832763, 0.00771861]),
 'std_fit_time': array([1.31913757e-04, 1.74909070e-04, 1.08613164e-04, 1.26002809e-04,
        6.94152080e-05, 3.35895819e-05, 6.89926850e-05, 3.77777229e-05,
        1.17167438e-04, 5.71438356e-05, 1.81786661e-04, 2.62703495e-04,
        1.67880069e-04, 8.94557693e-05, 1.80899374e-04, 2.80665209e-04,
        5.63225684e-04, 5.22787799e-04, 3.18457166e-04, 8.99234983e-05,
        6.87000071e-04, 8.26582317e-04, 2.73090490e-04, 3.91134834e-04,
        2.18889156e-04]),
 'mean_score_time': array([0.00370164, 0.00356545, 0.00354466, 0.00358715, 0.00358539,
        0.00333133, 0.00337515, 0.00350704, 0.00364423, 0.0036602 ,
        0.00

In [45]:
#Predict
"""
pclass =	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex =	Sex	
Age =	Age in years	
sibsp =	# of siblings / spouses aboard the Titanic	
parch =	# of parents / children aboard the Titanic	
fare =	Passenger fare	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
"""
pclass_1 = False
pclass_2 = False
pclass_3 = True
sex_female = False
sex_male = True 
parch =	2
sibsp = 2
fare =	5
embarked_c = True
embarked_q = False
embarked_s = False
embarked_u = False

Newdata = {"SibSp": [sibsp], "Parch": [parch], "Fare": [fare], "Sex_female": [sex_female], "Sex_male": [sex_male], "Embarked_C": [embarked_c], "Embarked_Q": [embarked_q], "Embarked_S": [embarked_s], "Embarked_U": [embarked_u], "Pclass_1": [pclass_1],"Pclass_2": [pclass_2],"Pclass_3": [pclass_3],"Categorized_age_(0, 10]": [False],"Categorized_age_(10, 20]": [False],"Categorized_age_(20, 30]": [True],"Categorized_age_(30, 40]": [False],"Categorized_age_(40, 50]": [False],"Categorized_age_(50, 60]": [False],"Categorized_age_(60, 70]": [False],"Categorized_age_(70, 80]": [False]}

xnew = pandas.DataFrame.from_dict(Newdata)

prediction = svm_gs.predict(xnew)

survived = "True" if prediction == 1 else "False"
print("Survived::" + survived)


Survived::False
