# MLP - Titanic

In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned Titanic dataset that was generated by 05-dm-filtering.ipynb.

df = pd.read_csv("../data/titanic-clean.csv")
# Count missing values per column as a sanity check on the cleaning step.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(889, 11)

In [4]:
# Preview the first rows to see the available columns and their value formats.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## Categorical Features

In [5]:
# Columns kept for modelling; "Survived" is the column used as the target later on.
select_features = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Fare",
]
# Subset of the selected columns that gets one-hot encoded.
categorical_features = [
    "Pclass",
    "Sex",
]

In [6]:
# Keep every row but only the selected columns, then preview the result.
df = df.loc[:, select_features]
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925
3,1,1,female,35.0,1,53.1
4,0,3,male,35.0,0,8.05


## One-Hot-Encoder

In [7]:
# Build the encoder and learn the categories of each categorical column in one step.
# handle_unknown='ignore' asks the encoder not to raise on categories it did not see during fit.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(df[categorical_features])
one_hot_encoder

In [8]:
# Names of the generated one-hot columns, built from the original column names.

column_names = one_hot_encoder.get_feature_names_out(input_features=categorical_features)
column_names

array(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male'],
      dtype=object)

In [9]:
# Apply the fitted one-hot encoder to the categorical columns.
# The result is a sparse matrix (see the output below), not a DataFrame.

transformed_data = one_hot_encoder.transform(df[categorical_features])
transformed_data

<889x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1778 stored elements in Compressed Sparse Row format>

In [10]:
# Converting the sparse matrix to a DataFrame.
# The index of `df` is passed explicitly so the encoded rows stay aligned with
# their source rows when the two frames are joined on index later, even if
# `df` does not carry a default 0..n-1 index.

df_transformed = pd.DataFrame.sparse.from_spmatrix(transformed_data,
                                                   index=df.index,
                                                   columns=column_names)
df_transformed

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...
884,0.0,1.0,0.0,0.0,1.0
885,1.0,0.0,0.0,1.0,0.0
886,0.0,0.0,1.0,1.0,0.0
887,1.0,0.0,0.0,0.0,1.0


In [11]:
# Keep only the columns that didn't need to be encoded.

df = df.drop(categorical_features, axis="columns")
df.head()

Unnamed: 0,Survived,Age,SibSp,Fare
0,0,22.0,1,7.25
1,1,38.0,1,71.2833
2,1,26.0,0,7.925
3,1,35.0,1,53.1
4,0,35.0,0,8.05


In [12]:
# Attach the one-hot encoded columns to the frame holding the non-categorical features.
# The join is index-based and keeps every row of `df` (left join).

final_df = df.join(other=df_transformed, how="left")
final_df.head()

Unnamed: 0,Survived,Age,SibSp,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.0,1,7.25,0.0,0.0,1.0,0.0,1.0
1,1,38.0,1,71.2833,1.0,0.0,0.0,1.0,0.0
2,1,26.0,0,7.925,0.0,0.0,1.0,1.0,0.0
3,1,35.0,1,53.1,1.0,0.0,0.0,1.0,0.0
4,0,35.0,0,8.05,0.0,0.0,1.0,0.0,1.0


## Converting Sparse into Int

In [13]:
# Replace the sparse-backed one-hot columns with regular dense columns.
final_df[column_names] = final_df[column_names].sparse.to_dense()

In [14]:
# Map every one-hot column to int8 and cast them all in a single astype call.
conversion = dict.fromkeys(column_names, 'int8')
final_df = final_df.astype(conversion)

In [15]:
# Confirm the one-hot columns now have the int8 dtype.
final_df.dtypes

Survived        int64
Age           float64
SibSp           int64
Fare          float64
Pclass_1         int8
Pclass_2         int8
Pclass_3         int8
Sex_female       int8
Sex_male         int8
dtype: object

## Scaling

In [16]:
# Summary statistics before scaling; compare the min/max of Age, SibSp and Fare.
final_df.describe()

Unnamed: 0,Survived,Age,SibSp,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,29.535624,0.524184,32.096681,0.24072,0.206974,0.552306,0.350956,0.649044
std,0.48626,14.527483,1.103705,49.697504,0.427761,0.405365,0.497536,0.477538,0.477538
min,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0
50%,0.0,28.0,0.0,14.4542,0.0,0.0,1.0,0.0,1.0
75%,1.0,38.0,1.0,31.0,0.0,0.0,1.0,1.0,1.0
max,1.0,80.0,8.0,512.3292,1.0,1.0,1.0,1.0,1.0


### MinMaxScaler

If we have a series of values between 0 and 10,
we can simply divide them all by 10 to get values between 0 and 1.

Now, if the values are spread between 10 and 20, we first need to shift them
down by 10 units so they lie between 0 and 10, and then divide them all by 10.

**Formula:**

$x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [17]:
# Numeric columns to rescale with the min-max formula.
columns_to_scale = ["Age", "SibSp", "Fare"]

# clip=True: values outside the fitted range are clipped when transformed.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test rows influence the fitted min/max.
scaler = MinMaxScaler(clip=True).fit(final_df[columns_to_scale])
final_df[columns_to_scale] = scaler.transform(final_df[columns_to_scale])

final_df.head()

Unnamed: 0,Survived,Age,SibSp,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,0.271174,0.125,0.014151,0,0,1,0,1
1,1,0.472229,0.125,0.139136,1,0,0,1,0
2,1,0.321438,0.0,0.015469,0,0,1,1,0
3,1,0.434531,0.125,0.103644,1,0,0,1,0
4,0,0.434531,0.0,0.015713,0,0,1,0,1


In [18]:
# Summary statistics after scaling: Age, SibSp and Fare should now span 0 to 1.
final_df.describe()

Unnamed: 0,Survived,Age,SibSp,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,0.365866,0.065523,0.062649,0.24072,0.206974,0.552306,0.350956,0.649044
std,0.48626,0.182552,0.137963,0.097003,0.427761,0.405365,0.497536,0.477538,0.477538
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.246042,0.0,0.015412,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.346569,0.0,0.028213,0.0,0.0,1.0,0.0,1.0
75%,1.0,0.472229,0.125,0.060508,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Testing with Age: 100, SibSp: 7, Fare: 250
# Since clip = True, Age will be transformed to 1
# even though 100 is above the maximum age seen during fit (80).
# The sample is wrapped in a DataFrame carrying the fitted column names so the
# scaler matches features by name and does not warn about missing feature names.

sample = pd.DataFrame([[100, 7, 250]], columns=columns_to_scale)
scaler.transform(sample)



array([[1.       , 0.875    , 0.4879675]])

## Train / Test Split

In [20]:
# Separate the target column from the feature columns.
y = final_df["Survived"]
X = final_df.drop(columns="Survived")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# (rows, feature columns) of the training split.
X_train.shape

(711, 8)

In [22]:
# (rows, feature columns) of the test split.
X_test.shape

(178, 8)

## Training

In [36]:
# Fit an MLP with two hidden layers of 20 units each on the training data.
mlp = MLPClassifier(hidden_layer_sizes=(20, 20), max_iter=300, random_state=42)
model = mlp.fit(X_train, y_train)



In [37]:
# Score on the training data, for comparison with the test score.
model.score(X_train, y_train)

0.8270042194092827

In [38]:
# Checking the score on the held-out test data.
# The result is between 0 and 1: 1 means 100% accurate.
model.score(X_test, y_test)

0.8146067415730337