# Pycaret model Titanic - Wouter Selis

## Install needed libraries to use pycaret.

In [1]:
#!pip install pandas
#!pip install numpy
#!pip install pycaret
#!pip install openpyxl

## Loading the data

In [2]:
import pandas as pd
import numpy as np
import category_encoders as ce

# Load a sheet into a DataFrame by its name
df = pd.read_excel('files/titanic3.xlsx')

# We are going to change these outliers to the mean price that has been paid by the other passengers, we could do this by simply changing
# the fare price of these passengers but let's use the technique that would be used when there are more than a few outliers
# We use the outlier detection and removal technique

# Calculate the IQR (InterQuartile Range) for the fare column
Q1 = df['fare'].quantile(0.25)
Q3 = df['fare'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bound for outliers, (sidenote, normally the multiplier used to calculate the lower and upper bound is around 1.5
# but this would cause the identification of normal data as outliers resulting in a lot of good data to be lost because it is flagged as an
# outlier. This is why we use such a high multiplier value.
lower_bound = Q1 - 10 * IQR
upper_bound = Q3 + 10 * IQR

# Filter the data to exclude outliers
df = df[(df['fare'] >= lower_bound) & (df['fare'] <= upper_bound)]

df['firstname']=df['name'].str.split(r'[,.]', expand=True)[2]
df['title']=df['name'].str.split(r'[,.]', expand=True)[1]
df['lastname']=df['name'].str.split(r'[,.]', expand=True)[0]

df.drop('name', axis = 1, inplace = True)

normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royal",
    "Don":        "Royal",
    "Sir" :       "Royal",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royal",
    "Dona":       "Royal",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royal"
}
# Strip leading and trailing spaces from the 'title' column
df['title'] = df['title'].str.strip()

# Now, apply the mapping to change original values to new values
df['title'] = df['title'].map(normalized_titles)

# Calculate the mean age for non-null values
mean_age = df['age'].mean()

# Calculate the standard deviation of the age column, which will be used to generate random but believable age values
std_age = df['age'].std()

# Create a mask to identify rows with "Master" or "Miss" in the "title" column
master_miss_mask = (df['title'] == 'Master') | (df['title'] == 'Miss')

# Generate random values for rows with "Master" or "Miss" based on a different standard deviation
random_values_master_miss = np.random.normal(loc=0, scale=std_age * 0.5, size=master_miss_mask.sum())

# Shift the distribution to have the same mean as the original data
added_values_master_miss = random_values_master_miss + mean_age

# Update the 'age' column for rows with "Master" or "Miss" individually
master_miss_indices = df.index[master_miss_mask]
for i, index in enumerate(master_miss_indices):
    # Ensure that the age does not exceed 18
    age = min(added_values_master_miss[i], 18)
    df.loc[index, 'age'] = age

# For all other missing values, use the previously calculated random values
random_values = np.random.normal(loc=0, scale=std_age, size=df['age'].isna().sum())
added_values = random_values + mean_age

# Update the 'age' column for all other missing values individually
other_indices = df.index[~master_miss_mask & df['age'].isna()]
for i, index in enumerate(other_indices):
    df.loc[index, 'age'] = added_values[i]

# Change the datatype of the age column from float to int
df['age'] = df['age'].astype(int)


df['cabin'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['cabin'] = df['cabin'].apply(lambda x: 1 if x != 0 else x)

# There are 2 null values in the embarked column, because it is such a small amount of data we simply change it to the value 'Q'
# which stands for Queenstown
df['embarked'] = df['embarked'].replace(np.nan, 'Q')


df['boat'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['boat'] = df['boat'].apply(lambda x: 1 if x != 0 else x)


df['body'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['body'] = df['body'].apply(lambda x: 1 if x != 0 else x)
# We change the datatype from float to int
df['body'] = df['body'].astype(int)

df.drop('home.dest', axis = 1, inplace = True)

df['survived'] = df['survived'].astype(bool)
df['boat'] = df['boat'].astype(bool)
df['body'] = df['body'].astype(bool)
df['embarked'] = str(df['embarked'])

encoder = ce.OrdinalEncoder(cols=['sex'])
df_encoded = encoder.fit_transform(df)


## Import module

In [3]:
from pycaret.classification import *

## Setup the Pycaret environment
Initialize the training environment and create the transformation pipeline to prepare the data for modeling and deployment. Target is set on survived because this is what we want to predict.

In [4]:
s = setup(df_encoded, target = 'survived', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1304, 15)"
4,Transformed data shape,"(1304, 20)"
5,Transformed train set shape,"(912, 20)"
6,Transformed test set shape,"(392, 20)"
7,Numeric features,7
8,Categorical features,5
9,Preprocess,True


## Compare the models
Here we compare the outcomes of all the models. Every highest score is marked in yellow. We can see that Logistic Regression scores overall the best.

In [5]:
# functional API
best = compare_models()
print(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9791,0.9939,0.9566,0.9884,0.9719,0.9554,0.9561,0.448
ridge,Ridge Classifier,0.9791,0.0,0.9566,0.9884,0.9719,0.9554,0.9561,0.034
lda,Linear Discriminant Analysis,0.9375,0.9746,0.8703,0.9635,0.9134,0.8647,0.8688,0.025
nb,Naive Bayes,0.9342,0.9805,0.9653,0.8779,0.9183,0.8636,0.8679,0.025
et,Extra Trees Classifier,0.9189,0.9789,0.839,0.9449,0.8867,0.824,0.8296,0.058
rf,Random Forest Classifier,0.8311,0.9404,0.6744,0.8533,0.7503,0.626,0.6381,0.059
svm,SVM - Linear Kernel,0.821,0.0,0.8447,0.8159,0.7908,0.6496,0.6781,0.026
knn,K Neighbors Classifier,0.773,0.8432,0.6398,0.7345,0.6812,0.5067,0.5118,0.234
dt,Decision Tree Classifier,0.715,0.6567,0.4143,0.6946,0.5046,0.3333,0.3589,0.025
ada,Ada Boost Classifier,0.704,0.7917,0.3742,0.7145,0.4763,0.3042,0.3413,0.041


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


## Test the best model
Now we want to test the best model (logistic regression) on our dataset.
In the column 'prediction_label' we can see if our model predicted if the person survived or not. 1 = survived & 0 = dead.
In the column 'prediction_score' we can see how sure our model thinks it has predicted the right outcome.

In [6]:
predict_model(best, data=df_encoded)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9755,0.997,0.9536,0.9813,0.9673,0.9477,0.9479


Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,firstname,title,lastname,survived,prediction_label,prediction_score
0,1,1,18,0,0,24160,211.337494,1,0 S\n1 S\n2 S\n3 S\n4 ...,True,False,Elisabeth Walton,Miss,Allen,True,1,0.9946
1,1,2,18,1,2,113781,151.550003,1,0 S\n1 S\n2 S\n3 S\n4 ...,True,False,Hudson Trevor,Master,Allison,True,1,0.9645
2,1,1,18,1,2,113781,151.550003,1,0 S\n1 S\n2 S\n3 S\n4 ...,False,False,Helen Loraine,Miss,Allison,False,0,0.8568
3,1,2,30,1,2,113781,151.550003,1,0 S\n1 S\n2 S\n3 S\n4 ...,False,True,Hudson Joshua Creighton,Mr,Allison,False,0,0.9934
4,1,1,25,1,2,113781,151.550003,1,0 S\n1 S\n2 S\n3 S\n4 ...,False,False,Hudson J C (Bessie Waldo Daniels),Mrs,Allison,False,0,0.7319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,1,18,1,0,2665,14.454200,0,0 S\n1 S\n2 S\n3 S\n4 ...,False,True,Hileni,Miss,Zabour,False,0,0.9572
1305,3,1,18,1,0,2665,14.454200,0,0 S\n1 S\n2 S\n3 S\n4 ...,False,False,Thamine,Miss,Zabour,False,0,0.8863
1306,3,2,26,0,0,2656,7.225000,0,0 S\n1 S\n2 S\n3 S\n4 ...,False,True,Mapriededer,Mr,Zakarian,False,0,0.9948
1307,3,2,27,0,0,2670,7.225000,0,0 S\n1 S\n2 S\n3 S\n4 ...,False,False,Ortin,Mr,Zakarian,False,0,0.9835


Get the accuracy to compare it later on.

In [7]:
measures = pull()

## Save model
Finally we can save the model in our files folder.

In [8]:
save_model(best, 'files/titanic_pycaret_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['pclass', 'sex', 'age', 'sibsp',
                                              'parch', 'fare', 'cabin'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  Transforme...
                                                               han

## Load model
If I want to reuse this model I can simply load it from my files folder

In [9]:
loaded_model = load_model('files/titanic_pycaret_model')
print(loaded_model)

Transformation Pipeline and Model Successfully Loaded
Pipeline(memory=FastMemory(location=C:\Users\woute\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['pclass', 'sex', 'age', 'sibsp',
                                             'parch', 'fare', 'cabin'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              missing_values=nan,
                                                              strategy='mean',
                                                              verbose='deprecat...
                                                             

## Without column 'boat' & 'body'
In previous model we used the dataset with feature boat and body included. Obviously when someone was on a boat their survival chance is greater then someone who was in the cold water. Also when they found the body of a person, this person obviously died. This means that when we include these 2 features in our dataset the model will almost only look at those 2 features to make predictions. To see the differnce in performance we will now create a model without these 2 features.

Here we are going to drop our column boat and body from our dataset.

In [10]:
df_exclude = df_encoded
df_exclude = df_exclude.drop(['boat', 'body'], axis=1)
df_exclude

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,firstname,title,lastname
0,1,True,1,18,0,0,24160,211.3375,1,0 S\n1 S\n2 S\n3 S\n4 ...,Elisabeth Walton,Miss,Allen
1,1,True,2,18,1,2,113781,151.5500,1,0 S\n1 S\n2 S\n3 S\n4 ...,Hudson Trevor,Master,Allison
2,1,False,1,18,1,2,113781,151.5500,1,0 S\n1 S\n2 S\n3 S\n4 ...,Helen Loraine,Miss,Allison
3,1,False,2,30,1,2,113781,151.5500,1,0 S\n1 S\n2 S\n3 S\n4 ...,Hudson Joshua Creighton,Mr,Allison
4,1,False,1,25,1,2,113781,151.5500,1,0 S\n1 S\n2 S\n3 S\n4 ...,Hudson J C (Bessie Waldo Daniels),Mrs,Allison
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,False,1,18,1,0,2665,14.4542,0,0 S\n1 S\n2 S\n3 S\n4 ...,Hileni,Miss,Zabour
1305,3,False,1,18,1,0,2665,14.4542,0,0 S\n1 S\n2 S\n3 S\n4 ...,Thamine,Miss,Zabour
1306,3,False,2,26,0,0,2656,7.2250,0,0 S\n1 S\n2 S\n3 S\n4 ...,Mapriededer,Mr,Zakarian
1307,3,False,2,27,0,0,2670,7.2250,0,0 S\n1 S\n2 S\n3 S\n4 ...,Ortin,Mr,Zakarian


## Create and test model

In [11]:
s_exclude = setup(df_exclude, target = 'survived', session_id = 123)

# functional API
best_exclude = compare_models()
print(best_exclude)

predict_model(best_exclude, data=df_exclude)

measures_exclude = pull()

save_model(best_exclude, 'files/titanic_exluded_pycaret_model')

loaded_exclude_model = load_model('files/titanic_exluded_pycaret_model')
print(loaded_exclude_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1304, 13)"
4,Transformed data shape,"(1304, 18)"
5,Transformed train set shape,"(912, 18)"
6,Transformed test set shape,"(392, 18)"
7,Numeric features,7
8,Categorical features,5
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8202,0.8695,0.6917,0.8125,0.7452,0.6078,0.6143,0.041
nb,Naive Bayes,0.7434,0.8002,0.5396,0.7221,0.613,0.4284,0.4414,0.026
ridge,Ridge Classifier,0.7226,0.0,0.4124,0.7532,0.5302,0.3564,0.3906,0.031
et,Extra Trees Classifier,0.7182,0.7843,0.479,0.6945,0.5576,0.3641,0.3821,0.058
rf,Random Forest Classifier,0.7061,0.7791,0.4005,0.7036,0.5077,0.3205,0.3478,0.069
knn,K Neighbors Classifier,0.704,0.7137,0.513,0.6407,0.567,0.3471,0.3537,0.041
svm,SVM - Linear Kernel,0.6852,0.0,0.5312,0.5296,0.4957,0.3076,0.3342,0.027
gbc,Gradient Boosting Classifier,0.6809,0.6137,0.3028,0.6909,0.416,0.2415,0.2822,0.043
lda,Linear Discriminant Analysis,0.6788,0.721,0.2767,0.691,0.3904,0.2275,0.2714,0.029
dt,Decision Tree Classifier,0.6787,0.6112,0.3286,0.6632,0.4357,0.2459,0.2773,0.027


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9202,0.9726,0.8387,0.9455,0.8889,0.827,0.8306


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded
Pipeline(memory=FastMemory(location=C:\Users\woute\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['pclass', 'sex', 'age', 'sibsp',
                                             'parch', 'fare', 'cabin'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_empty_features=False,
                                                              missing_values=nan,
                                                              strategy='mean',
                                                              verbose='deprecat...
        

## Lets compare the 2 models.
We can see that all the measures of the model with the boat & body columns included in the dataset scored a little better then the model without them. We thought that boat & body would make a big difference to predict if a preson survived or not. Using this model it doesn't really make a big difference. When we see at Kieran's model using GradientBoostingRegressor we see that there is a big difference though. 

Accuracy all columns:

In [12]:
measures

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9755,0.997,0.9536,0.9813,0.9673,0.9477,0.9479


Accuracy without column 'boat' & 'body':

In [13]:
measures_exclude

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9202,0.9726,0.8387,0.9455,0.8889,0.827,0.8306
