In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import layers

In [4]:
# Step 0: Read the train dataset and get a general idea on how it looks like
df = pd.read_csv('./Data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Step 1: Understand the general profile of this dataset and how consistently populated are its features

In [6]:
#profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
#profile.to_file('RawDataProfile.html')

In [7]:
# There are many blanks in the 'Age' and in the 'Cabin' columns - Populating these features would be a bit troublesome - Discarding is an option for the Cabin column, since the amount of nulls is significantly higher

In [8]:
# Step 2: Feature Engineering - Curating and adding new features from the raw data

In [9]:
# Adding two new features: Family and Entitlement

In [10]:
df['Family'] = df['Name'].str.split(',').str.get(0)
df['Entitlement'] = df['Name'].str.split(',').str.get(1).str.split(' ').str.get(1)

In [11]:
df['Entitlement'].unique()

array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Rev.', 'Dr.', 'Mme.',
       'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.', 'the',
       'Jonkheer.'], dtype=object)

In [12]:
# Trying to get an idea of how many people there are in each family
df[['Family', 'Name']].groupby('Family').size().reset_index(name='FamilySize').sort_values(by=['FamilySize'], ascending=False).head()

Unnamed: 0,Family,FamilySize
16,Andersson,9
532,Sage,7
563,Skoog,6
100,Carter,6
220,Goodwin,6


In [13]:
# Trying to get an idea of how many people there are by entitlement
df[['Entitlement', 'Name']].groupby('Entitlement').size().reset_index(name='qt').sort_values(by=['qt'], ascending=False).head()

Unnamed: 0,Entitlement,qt
11,Mr.,517
8,Miss.,182
12,Mrs.,125
7,Master.,40
3,Dr.,7


In [14]:
# Trying to get an idea of how many people survived in each family
df['Survived'] = df['Survived'].astype('int32')
df[['Family', 'Survived']].groupby('Family').sum().reset_index().sort_values(by=['Survived'], ascending=False).head()

Unnamed: 0,Family,Survived
100,Carter,4
32,Baclini,4
292,Johnson,3
306,Kelly,3
239,Harper,3


In [15]:
# Trying to get an idea of how many people survived by each entitlement
df[['Entitlement', 'Survived']].groupby('Entitlement').sum().reset_index().sort_values(by=['Survived'], ascending=False).head()

Unnamed: 0,Entitlement,Survived
8,Miss.,127
12,Mrs.,99
11,Mr.,81
7,Master.,23
3,Dr.,3


In [16]:
# Idea: add a column named 'FamilySize' to the original dataframe
df_family = df[['Family', 'Name']].groupby('Family').size().reset_index(name='FamilySize')
df = pd.merge(df, df_family, on='Family', how='left')

In [17]:
# Removing the 'Name' and the 'Family' columns now as they became unnecessary
df.drop('Name', axis=1, inplace=True)
df.drop('Family', axis=1, inplace=True)

In [18]:
# Trying to understand the different types of families
df[['SibSp', 'Parch', 'FamilySize']].drop_duplicates().head()

Unnamed: 0,SibSp,Parch,FamilySize
0,1,0,2
1,1,0,1
2,0,0,1
4,0,0,2
5,0,0,3


In [19]:
df[['SibSp', 'Parch', 'FamilySize','PassengerId']].groupby(['SibSp', 'Parch', 'FamilySize']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PassengerId
SibSp,Parch,FamilySize,Unnamed: 3_level_1
0,0,1,454
0,0,2,41
0,0,3,20
0,0,4,18
0,0,6,3


In [20]:
df['FamilyCategory'] = np.select(
    [
        (df['SibSp'] == 1) & (df['Parch'] == 0), 
        (df['SibSp'] == 0) & (df['Parch'] == 1),
        (df['SibSp'] > 1 )& (df['Parch'] == 0),
        (df['SibSp'] == 0) & (df['Parch'] > 1),
        (df['SibSp'] == 0) & (df['Parch'] == 0) & (df['FamilySize'] != 1),
        (df['SibSp'] == 0) & (df['Parch'] == 0) & (df['FamilySize'] == 1)
    ], 
    [
        'Couple', 
        'Couple',
        'Couple and Children',
        'Couple and Children',
        'Relatives',
        'Single person'
    ], 
    default='Single person' # defaulting to 'Single Person' as most people were by themselves
)

In [21]:
# Filling in the null values for Age based in each family category

In [22]:
df.loc[df['Age'].isna()].groupby('FamilyCategory').count()['PassengerId']

FamilyCategory
Couple                  22
Couple and Children      6
Relatives               22
Single person          127
Name: PassengerId, dtype: int64

In [23]:
df.loc[~df['Age'].isna()].groupby('FamilyCategory').median()['Age']

FamilyCategory
Couple                 29.0
Couple and Children    26.0
Relatives              29.0
Single person          28.0
Name: Age, dtype: float64

In [24]:
df.loc[df.FamilyCategory.eq('Couple') & df.Age.isna()] = df.loc[df.FamilyCategory.eq('Couple') & df.Age.isna()].fillna(29)
df.loc[df.FamilyCategory.eq('Couple and Children') & df.Age.isna()] = df.loc[df.FamilyCategory.eq('Couple and Children') & df.Age.isna()].fillna(26);
df.loc[df.FamilyCategory.eq('Relatives') & df.Age.isna()] = df.loc[df.FamilyCategory.eq('Relatives') & df.Age.isna()].fillna(29);
df.loc[df.FamilyCategory.eq('Single person') & df.Age.isna()] = df.loc[df.FamilyCategory.eq('Single person') & df.Age.isna()].fillna(28);

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Entitlement,FamilySize,FamilyCategory
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,Couple
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,1,Couple
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,Single person
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,Couple
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr.,2,Relatives


In [26]:
df[['Fare', 'Cabin']].loc[~df['Cabin'].isna()]

Unnamed: 0,Fare,Cabin
1,71.2833,C85
3,53.1000,C123
5,8.4583,29
6,51.8625,E46
10,16.7000,G6
...,...,...
878,7.8958,28
879,83.1583,C50
887,30.0000,B42
888,23.4500,28


In [27]:
# The 'Cabin' column is very badly populated. My decision will be to drop it.

In [28]:
df.drop('Cabin', axis=1, inplace=True)

In [29]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Entitlement,FamilySize,FamilyCategory
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S,Mr.,2,Couple
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,Mrs.,1,Couple
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss.,1,Single person
3,4,1,1,female,35.0,1,0,113803,53.1,S,Mrs.,2,Couple
4,5,0,3,male,35.0,0,0,373450,8.05,S,Mr.,2,Relatives


In [30]:
# The 'Ticket' column has a very high cardinality, making it difficult to use it for classification. My decision will also be to drop it

In [31]:
df.drop('Ticket', axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Entitlement,FamilySize,FamilyCategory
0,1,0,3,male,22.0,1,0,7.25,S,Mr.,2,Couple
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs.,1,Couple
2,3,1,3,female,26.0,0,0,7.925,S,Miss.,1,Single person
3,4,1,1,female,35.0,1,0,53.1,S,Mrs.,2,Couple
4,5,0,3,male,35.0,0,0,8.05,S,Mr.,2,Relatives


In [32]:
# The data seems to be in a much better shape now for training a model

In [33]:
# Generating a new profiling report for analysis

In [34]:
#profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
#profile.to_file('CuratedDataProfile.html')

In [35]:
# Getting the data ready for training: Applying one-hot encoding to the categorical fields

In [36]:
df = pd.get_dummies(df)

In [37]:
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

In [38]:
# Standardizing and scaling the data

In [39]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [40]:
# Step 3: Model Training

In [41]:
batch_size = 32
num_classes = 2
epochs = 20

In [42]:
# Trains a deep NN on the dataset
model = Sequential()
# Rectified Linear Unit (ReLU) as the 1st Activation Function
# What it does is essentially outputting the input directly if it is positive, otherwise, it will output zero
model.add(Dense(256, activation='relu', input_dim=33))
# Softmax function as the 2nd Activation Function 
# What it does it essentially normalizing the output of a network to a probability distribution over the predicted output classes
model.add(Dense(1, activation='softmax'))

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

history = model.fit(X_train_std, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test_std, y_test))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               8704      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 8,961
Trainable params: 8,961
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
score = model.evaluate(X_test_std, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.4895698130130768
Test accuracy: 0.38432836532592773


In [44]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss']) 
plt.title('Model loss');
plt.ylabel('Loss');
plt.xlabel('Epoch'); 
plt.legend(['Train', 'Test'], loc='upper left');
plt.show()

  plt.show()
