# Loading the data

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import category_encoders as ce
import missingno

# Load a sheet into a DataFrame by its name
df = pd.read_excel('files/titanic3.xlsx')

# We are going to change these outliers to the mean price that has been paid by the other passengers, we could do this by simply changing
# the fare price of these passengers but let's use the technique that would be used when there are more than a few outliers
# We use the outlier detection and removal technique

# Calculate the IQR (InterQuartile Range) for the fare column
Q1 = df['fare'].quantile(0.25)
Q3 = df['fare'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bound for outliers, (sidenote, normally the multiplier used to calculate the lower and upper bound is around 1.5
# but this would cause the identification of normal data as outliers resulting in a lot of good data to be lost because it is flagged as an
# outlier. This is why we use such a high multiplier value.
lower_bound = Q1 - 10 * IQR
upper_bound = Q3 + 10 * IQR

# Filter the data to exclude outliers
df = df[(df['fare'] >= lower_bound) & (df['fare'] <= upper_bound)]

df['firstname']=df['name'].str.split(r'[,.]', expand=True)[2]
df['title']=df['name'].str.split(r'[,.]', expand=True)[1]
df['lastname']=df['name'].str.split(r'[,.]', expand=True)[0]

df.drop('name', axis = 1, inplace = True)

normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royal",
    "Don":        "Royal",
    "Sir" :       "Royal",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royal",
    "Dona":       "Royal",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royal"
}
# Strip leading and trailing spaces from the 'title' column
df['title'] = df['title'].str.strip()

# Now, apply the mapping to change original values to new values
df['title'] = df['title'].map(normalized_titles)

# Calculate the mean age for non-null values
mean_age = df['age'].mean()

# Calculate the standard deviation of the age column, which will be used to generate random but believable age values
std_age = df['age'].std()

# Create a mask to identify rows with "Master" or "Miss" in the "title" column
master_miss_mask = (df['title'] == 'Master') | (df['title'] == 'Miss')

# Generate random values for rows with "Master" or "Miss" based on a different standard deviation
random_values_master_miss = np.random.normal(loc=0, scale=std_age * 0.5, size=master_miss_mask.sum())

# Shift the distribution to have the same mean as the original data
added_values_master_miss = random_values_master_miss + mean_age

# Update the 'age' column for rows with "Master" or "Miss" individually
master_miss_indices = df.index[master_miss_mask]
for i, index in enumerate(master_miss_indices):
    # Ensure that the age does not exceed 18
    age = min(added_values_master_miss[i], 18)
    df.loc[index, 'age'] = age

# For all other missing values, use the previously calculated random values
random_values = np.random.normal(loc=0, scale=std_age, size=df['age'].isna().sum())
added_values = random_values + mean_age

# Update the 'age' column for all other missing values individually
other_indices = df.index[~master_miss_mask & df['age'].isna()]
for i, index in enumerate(other_indices):
    df.loc[index, 'age'] = added_values[i]

# Change the datatype of the age column from float to int
df['age'] = df['age'].astype(int)


df['cabin'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['cabin'] = df['cabin'].apply(lambda x: 1 if x != 0 else x)

# There are 2 null values in the embarked column, because it is such a small amount of data we simply change it to the value 'Q'
# which stands for Queenstown
df['embarked'] = df['embarked'].replace(np.nan, 'Q')


df['boat'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['boat'] = df['boat'].apply(lambda x: 1 if x != 0 else x)


df['body'].fillna(0, inplace=True)

# Replace non-null values with 1 without having problems because there are non-numerical values
df['body'] = df['body'].apply(lambda x: 1 if x != 0 else x)
# We change the datatype from float to int
df['body'] = df['body'].astype(int)

df.drop('home.dest', axis = 1, inplace = True)

df['survived'] = df['survived'].astype(bool)
df['boat'] = df['boat'].astype(bool)
df['body'] = df['body'].astype(bool)
df['embarked'] = str(df['embarked'])

encoder = ce.OrdinalEncoder(cols=['sex'])
df_encoded = encoder.fit_transform(df)


# Self chosen model

# Gradient Boosting Regressor

I think that Gradient Boosting Regressor is a good technique to use for the Titanic dataset. They create an ensemble of weak learners to build a strong predictive model. These algorithms are known for their high predictive accuracy and can handle missing data and outliers gracefully.

# With boat and body

First we need to do all of our imports so we can actually use the gradient boosting regressor

In [37]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

Now we are going to define the features wich we want to use to make a prediction and the value we acctually want to predict. We place these in the variables x and y.

In [38]:
features = ['pclass','sex','age','sibsp','parch','fare','cabin','boat','body','ticket', 'embarked','firstname','title','lastname']
x = df_encoded[features]
y = df_encoded['survived']

The gradient boosting regressor can't handle text values. But in our dataset we do have text values. So with this line off code we are going to give all of these text values a number.

In [39]:
X = pd.get_dummies(x, columns=['ticket', 'embarked','firstname','title','lastname'], drop_first=True)

Then we are going to split the dataset into a test and a training set.

In [40]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Now we are going to fit our model. We are using 100 estimators and random state 0.

In [41]:
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=0)
gb_regressor.fit(x_train, y_train)

Here we are going to use our test set to make a predictions using the model we just created. We save it in a temporary variable because our model returns float values, but we want to have boolean values (or 0 and 1).

In [42]:
y_pred_prob = gb_regressor.predict(x_test)

To convert our float values to a boolean (or 0 and 1) we use the code below. everything that the model predicts as higher than 0.5 will be true and lower than 0.5 will be false.

In [43]:
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

Now we are going to print the actual values and the predicted values so we can see how well our model did.

In [44]:
result_df = pd.DataFrame({'Actual': y_test[:25], 'Predicted': y_pred[:25]})

print(result_df)

      Actual  Predicted
1157   False          0
978     True          1
787    False          0
1023    True          1
316    False          0
482     True          1
5       True          1
645     True          1
420    False          0
430     True          1
286    False          0
828    False          0
31      True          1
678    False          0
970     True          1
57      True          1
715    False          0
835    False          0
911    False          0
998    False          0
946    False          0
1025    True          1
1244    True          1
829    False          0
805     True          1


# Without Boat and Body

I have created another model but without the body and boat values. This is because these values will have a verry big impact on the result. For example if you don't have a boat the chances of survival are almost 0. So our model will almost always say that you won't survive if you don't have a boat. So with this model we can see hopw well it works without these almost certain values.

In [45]:
features = ['pclass','sex','age','sibsp','parch','fare','cabin','ticket', 'embarked','firstname','title','lastname']
x = df_encoded[features]
y = df_encoded['survived']

### Create and test model

In [46]:
X = pd.get_dummies(x, columns=['ticket', 'embarked','firstname','title','lastname'], drop_first=True)

#Split into training and test set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#Fit model
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=0)
gb_regressor.fit(x_train, y_train)

#Make predictions using test set
y_pred_prob = gb_regressor.predict(x_test)

#Convert float values to 0 or 1
threshold = 0.5
y_pred_no_boat = (y_pred_prob > threshold).astype(int)

#Print actual and predicted values
result_df = pd.DataFrame({'Actual': y_test[:25], 'Predicted': y_pred_no_boat[:25]})
print(result_df)

      Actual  Predicted
1157   False          0
978     True          0
787    False          0
1023    True          1
316    False          0
482     True          1
5       True          0
645     True          0
420    False          0
430     True          1
286    False          1
828    False          0
31      True          0
678    False          0
970     True          0
57      True          1
715    False          0
835    False          0
911    False          0
998    False          0
946    False          1
1025    True          1
1244    True          1
829    False          0
805     True          1


### Let's compare the models

As we can see are the predictions of the model with the boat much better than the predictions without.

In [47]:
result_df = pd.DataFrame({'Actual': y_test[:25],'Predicted with boat': y_pred[:25], 'Predicted without boat': y_pred_no_boat[:25]})
print(result_df)

      Actual  Predicted with boat  Predicted without boat
1157   False                    0                       0
978     True                    1                       0
787    False                    0                       0
1023    True                    1                       1
316    False                    0                       0
482     True                    1                       1
5       True                    1                       0
645     True                    1                       0
420    False                    0                       0
430     True                    1                       1
286    False                    0                       1
828    False                    0                       0
31      True                    1                       0
678    False                    0                       0
970     True                    1                       0
57      True                    1                       1
715    False  

In [48]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with boat: {mse}")

r2 = r2_score(y_test, y_pred)
print(f"R-squared with boat: {r2}")

mse = mean_squared_error(y_test, y_pred_no_boat)
print(f"Mean Squared Error without boat: {mse}")

r2 = r2_score(y_test, y_pred_no_boat)
print(f"R-squared without boat: {r2}")

Mean Squared Error with boat: 0.01532567049808429
R-squared with boat: 0.933494712702255
Mean Squared Error without boat: 0.16091954022988506
R-squared without boat: 0.3016944833736783
