# Comparison of models Titanic

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce

# Load a sheet into a DataFrame by its name
df = pd.read_excel('files/titanic3.xlsx')

# We are going to change these outliers to the mean price that has been paid by the other passengers, we could do this by simply changing
# the fare price of these passengers but let's use the technique that would be used when there are more than a few outliers
# We use the outlier detection and removal technique

# Calculate the IQR (InterQuartile Range) for the fare column
Q1 = df['fare'].quantile(0.25)
Q3 = df['fare'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bound for outliers, (sidenote, normally the multiplier used to calculate the lower and upper bound is around 1.5
# but this would cause the identification of normal data as outliers resulting in a lot of good data to be lost because it is flagged as an
# outlier. This is why we use such a high multiplier value.
lower_bound = Q1 - 10 * IQR
upper_bound = Q3 + 10 * IQR

# Filter the data to exclude outliers
df = df[(df['fare'] >= lower_bound) & (df['fare'] <= upper_bound)]

df['firstname']=df['name'].str.split(r'[,.]', expand=True)[2]
df['title']=df['name'].str.split(r'[,.]', expand=True)[1]
df['lastname']=df['name'].str.split(r'[,.]', expand=True)[0]

df.drop('name', axis = 1, inplace = True)

normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royal",
    "Don":        "Royal",
    "Sir" :       "Royal",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royal",
    "Dona":       "Royal",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royal"
}
# Strip leading and trailing spaces from the 'title' column
df['title'] = df['title'].str.strip()

# Now, apply the mapping to change original values to new values
df['title'] = df['title'].map(normalized_titles)

# Calculate the mean age for non-null values
mean_age = df['age'].mean()

# Calculate the standard deviation of the age column, which will be used to generate random but believable age values
std_age = df['age'].std()

# Create a mask to identify rows with "Master" or "Miss" in the "title" column
master_miss_mask = (df['title'] == 'Master') | (df['title'] == 'Miss')

# Generate random values for rows with "Master" or "Miss" based on a different standard deviation
random_values_master_miss = np.random.normal(loc=0, scale=std_age * 0.5, size=master_miss_mask.sum())

# Shift the distribution to have the same mean as the original data
added_values_master_miss = random_values_master_miss + mean_age

# Update the 'age' column for rows with "Master" or "Miss" individually
master_miss_indices = df.index[master_miss_mask]
for i, index in enumerate(master_miss_indices):
    # Ensure that the age does not exceed 18
    age = min(added_values_master_miss[i], 18)
    df.loc[index, 'age'] = age

# For all other missing values, use the previously calculated random values
random_values = np.random.normal(loc=0, scale=std_age, size=df['age'].isna().sum())
added_values = random_values + mean_age

# Update the 'age' column for all other missing values individually
other_indices = df.index[~master_miss_mask & df['age'].isna()]
for i, index in enumerate(other_indices):
    df.loc[index, 'age'] = added_values[i]

# Change the datatype of the age column from float to int
df['age'] = df['age'].astype(int)


df['cabin'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['cabin'] = df['cabin'].apply(lambda x: 1 if x != 0 else x)

# There are 2 null values in the embarked column, because it is such a small amount of data we simply change it to the value 'Q'
# which stands for Queenstown
df['embarked'] = df['embarked'].replace(np.nan, 'Q')


df['boat'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['boat'] = df['boat'].apply(lambda x: 1 if x != 0 else x)


df['body'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['body'] = df['body'].apply(lambda x: 1 if x != 0 else x)
# We change the datatype from float to int
df['body'] = df['body'].astype(int)

df.drop('home.dest', axis = 1, inplace = True)

df['survived'] = df['survived'].astype(bool)
df['boat'] = df['boat'].astype(bool)
df['body'] = df['body'].astype(bool)
df['embarked'] = str(df['embarked'])

encoder = ce.OrdinalEncoder(cols=['sex'])
df_encoded = encoder.fit_transform(df)

## Wouter model Pycaret Logistic Regression

In [10]:
from pycaret.classification import *
s = setup(df_encoded, target = 'survived', session_id = 123)
loaded_Pycaret_model = load_model('files/titanic_pycaret_model')
predict_model(loaded_Pycaret_model, data=df_encoded)
measuresLogisticRegression = pull()
measuresLogisticRegression

Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1304, 15)"
4,Transformed data shape,"(1304, 20)"
5,Transformed train set shape,"(912, 20)"
6,Transformed test set shape,"(392, 20)"
7,Numeric features,7
8,Categorical features,5
9,Preprocess,True


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9755,0.997,0.9536,0.9813,0.9673,0.9477,0.9479


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9755,0.997,0.9536,0.9813,0.9673,0.9477,0.9479


In [13]:
s = setup(df_encoded, target = 'survived', session_id = 123)


loaded_Pycaret_model_excluded = load_model('files/titanic_exluded_pycaret_model')
predict_model(loaded_Pycaret_model_excluded, data=df_encoded)
measuresLogisticRegression_excluded = pull()
measuresLogisticRegression_excluded

Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1304, 15)"
4,Transformed data shape,"(1304, 20)"
5,Transformed train set shape,"(912, 20)"
6,Transformed test set shape,"(392, 20)"
7,Numeric features,7
8,Categorical features,5
9,Preprocess,True


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9218,0.9727,0.8407,0.9477,0.891,0.8304,0.834


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9218,0.9727,0.8407,0.9477,0.891,0.8304,0.834


## Kieran model GradientBoostingRegressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
features = ['pclass','sex','age','sibsp','parch','fare','cabin','boat','body','ticket', 'embarked','firstname','title','lastname']
x = df_encoded[features]
y = df_encoded['survived']

X = pd.get_dummies(x, columns=['ticket', 'embarked','firstname','title','lastname'], drop_first=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
features_no_boat = ['pclass','sex','age','sibsp','parch','fare','cabin','ticket', 'embarked','firstname','title','lastname']
x_no_boat = df_encoded[features_no_boat]
y_no_boat = df_encoded['survived']

X_no_boat = pd.get_dummies(x_no_boat, columns=['ticket', 'embarked','firstname','title','lastname'], drop_first=True)

#Split into training and test set
x_train_no_boat, x_test_no_boat, y_train_no_boat, y_test_no_boat = train_test_split(X_no_boat, y_no_boat, test_size=0.2, random_state=0)

In [14]:
import pickle
# Load the model from disk
filename = 'files/titanic_chosen_model_no_boat.sav'
GradientBoostRegressor_model_no_boat = pickle.load(open(filename, 'rb'))
GradientBoostRegressor_model_no_boat


In [15]:
# Load the model from disk
filename = 'files/titanic_chosen_model.sav'
GradientBoostRegressor_model = pickle.load(open(filename, 'rb'))
GradientBoostRegressor_model

In [19]:
y_pred_prob = GradientBoostRegressor_model.predict(x_test)
y_pred_prob_no_boat = GradientBoostRegressor_model_no_boat.predict(x_test_no_boat)

threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)
y_pred_no_boat = (y_pred_prob_no_boat > threshold).astype(int)

## Laurens model AWS

# Comparison

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Accuracy
Accuracy is used to measure how well a model correctly predicts the class labels of the instances in a dataset.
Here we can see that the GradienBoostRegressor scores better with column boat & body included but scores less when they are exluded.
Probably the reason why Pycaret couldn't find a model that scores better than the GradientBoostRegressor is because it has a limited list of models. So it is pretty cool Kieran found a model that beats Pycaret.


In [23]:

print("Wouter model Pycaret LogisticRegression: ", measuresLogisticRegression["Accuracy"])
print("Wouter model Pycaret LogisticRegression no boat: ", measuresLogisticRegression_excluded["Accuracy"])
print()
GradientBoostRegressor_Accuracy = accuracy_score(y_test, y_pred)
GradientBoostRegressor_Accuracy_no_boat = accuracy_score(y_test_no_boat, y_pred_no_boat)

print("Kieran model GradientBoostRegressor: ", GradientBoostRegressor_Accuracy)
print("Kieran model GradientBoostRegressor no boat: ", GradientBoostRegressor_Accuracy_no_boat)

Wouter model Pycaret LogisticRegression:  0    0.9755
Name: Accuracy, dtype: float64
Wouter model Pycaret LogisticRegression no boat:  0    0.9218
Name: Accuracy, dtype: float64

Kieran model GradientBoostRegressor:  0.9846743295019157
Kieran model GradientBoostRegressor no boat:  0.8314176245210728


## Precision
Precision is used to measure the accuracy of positive predictions made by a model. It answers the question, out of all instances predicted as positive, how many were correctly predicted?


In [26]:
print("Wouter model Pycaret LogisticRegression: ", measuresLogisticRegression["Prec."])
print("Wouter model Pycaret LogisticRegression no boat: ", measuresLogisticRegression_excluded["Prec."])
print()

GradientBoostRegressor_precision = precision_score(y_test, y_pred)
GradientBoostRegressor_precision_no_boat = precision_score(y_test_no_boat, y_pred_no_boat)

print("Kieran model GradientBoostRegressor: ",GradientBoostRegressor_precision)
print("Kieran model GradientBoostRegressor no boat: ", GradientBoostRegressor_precision_no_boat)

Wouter model Pycaret LogisticRegression:  0    0.9813
Name: Prec., dtype: float64
Wouter model Pycaret LogisticRegression no boat:  0    0.9477
Name: Prec., dtype: float64

Kieran model GradientBoostRegressor:  0.9891304347826086
Kieran model GradientBoostRegressor no boat:  0.8205128205128205


## Recall
Recall which is also known as Sensitivity or True Positive Rate is used to measure a model's ability to correctly  identify all positive instances within a dataset. It answers the question, out of all ACTUAL positive instances, how many were correctly predicted as positive by the model?


In [27]:
print("Wouter model Pycaret LogisticRegression: ", measuresLogisticRegression["Recall"])
print("Wouter model Pycaret LogisticRegression no boat: ", measuresLogisticRegression_excluded["Recall"])
print()

GradientBoostRegressor_recall = recall_score(y_test, y_pred)
GradientBoostRegressor_recall_no_boat = recall_score(y_test_no_boat, y_pred_no_boat)

print("Kieran model GradientBoostRegressor: ",GradientBoostRegressor_recall)
print("Kieran model GradientBoostRegressor no boat: ", GradientBoostRegressor_recall_no_boat)

Wouter model Pycaret LogisticRegression:  0    0.9536
Name: Recall, dtype: float64
Wouter model Pycaret LogisticRegression no boat:  0    0.8407
Name: Recall, dtype: float64

Kieran model GradientBoostRegressor:  0.9680851063829787
Kieran model GradientBoostRegressor no boat:  0.6808510638297872


## ROC AUC
ROC AUC is used to measures how well the model can distinguish between positive and negative. A higher ROC AUC score would mean that the model is better at this and lower means it is bad. A score of 0.5 would mean truly random and would mean that the model is no better than random guessing and with 1.0 it is perfect at distinguishing between the classes.

In [28]:
print("Wouter model Pycaret LogisticRegression: ", measuresLogisticRegression["AUC"])
print("Wouter model Pycaret LogisticRegression no boat: ", measuresLogisticRegression_excluded["AUC"])
print()

GradientBoostRegressor_roc_auc = roc_auc_score(y_test, y_pred)
GradientBoostRegressor_roc_auc_no_boat = roc_auc_score(y_test_no_boat, y_pred_no_boat)

print("Kieran model GradientBoostRegressor: ",GradientBoostRegressor_roc_auc)
print("Kieran model GradientBoostRegressor no boat: ", GradientBoostRegressor_roc_auc_no_boat)

Wouter model Pycaret LogisticRegression:  0    0.997
Name: AUC, dtype: float64
Wouter model Pycaret LogisticRegression no boat:  0    0.9727
Name: AUC, dtype: float64

Kieran model GradientBoostRegressor:  0.9810485412154414
Kieran model GradientBoostRegressor no boat:  0.798509364250223


# Conlusion

We see that the Pycaret LogisticRegression model scores less then GradientBoostRegressor model on the trained model with boat and body columns. Except on the last metric ROC AUC the Pycaret LogisticRegression model scores better. This means that in general the GradientBoostRegressor model is better at differentiating between the four different classes TP, FP, TN, FN. When it comes to differentiating between just positive and negative (ROC AUC) we see that the Pycaret LogisticRegressor model is better.


When we look at the models excluding the columns boat and body, we see that the Pycaret LogisticRegression model scores better on all the metrics. When we look at the GradientBoostRegressor model not using the boat and body column we see a clear drop in perfomance. This means that the GradientBoostRegressor model depends on the columns boat and body and therefore drops in performance when exlcuding these columns. Conversely the Pycaret LogisticRegression model remains kind of the same when dropping these two columns. 