In [11]:
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

ModuleNotFoundError: No module named 'tensorflow.keras'

# Step 0: Read the training dataset and get a general idea of what it looks like

In [None]:
# Load the raw training data and preview the first rows
df = pd.read_csv(filepath_or_buffer='Data/train.csv')
df.head(n=5)

# Step 1: Understand the general profile of this dataset and how consistently its features are populated

In [None]:
#profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
#profile.to_file('RawDataProfile.html')

In [None]:
# There are many blanks in the 'Age' and 'Cabin' columns, and populating them would be troublesome. Discarding is an option for the 'Cabin' column, since its share of nulls is significantly higher.

# Step 2: Feature Engineering - Curating and adding new features from the raw data

In [None]:
# Adding two new features: Family and Entitlement

In [None]:
# 'Family' is the text before the first comma of 'Name'; 'Entitlement' is the
# second space-separated token of the text after that comma.
name_parts = df['Name'].str.split(',')
df['Family'] = name_parts.str[0]
df['Entitlement'] = name_parts.str[1].str.split(' ').str[1]

In [None]:
# Distinct entitlement tokens extracted from the names
pd.unique(df['Entitlement'])

In [None]:
# Row count per 'Family' value, largest groups first
(
    df[['Family', 'Name']]
    .groupby('Family')
    .size()
    .reset_index(name='FamilySize')
    .sort_values(by=['FamilySize'], ascending=False)
    .head()
)

In [None]:
# Row count per 'Entitlement' value, most frequent first
(
    df[['Entitlement', 'Name']]
    .groupby('Entitlement')
    .size()
    .reset_index(name='qt')
    .sort_values(by=['qt'], ascending=False)
    .head()
)

In [None]:
# Number of survivors per 'Family' value (sum of the 0/1 'Survived' flag),
# after casting the flag to int32
df['Survived'] = df['Survived'].astype('int32')
(
    df[['Family', 'Survived']]
    .groupby('Family')
    .sum()
    .reset_index()
    .sort_values(by=['Survived'], ascending=False)
    .head()
)

In [None]:
# Number of survivors per 'Entitlement' value, highest first
(
    df[['Entitlement', 'Survived']]
    .groupby('Entitlement')
    .sum()
    .reset_index()
    .sort_values(by=['Survived'], ascending=False)
    .head()
)

In [None]:
# Attach a 'FamilySize' column (number of rows sharing the same 'Family' value)
# to every passenger via a left join on 'Family'.
df_family = (
    df[['Family', 'Name']]
    .groupby('Family')
    .size()
    .reset_index(name='FamilySize')
)
df = df.merge(df_family, on='Family', how='left')

In [None]:
# 'Name' and 'Family' have served their purpose (Entitlement / FamilySize), so drop both
df.drop(columns=['Name', 'Family'], inplace=True)

In [None]:
# Distinct (SibSp, Parch, FamilySize) combinations present in the data
(
    df[['SibSp', 'Parch', 'FamilySize']]
    .drop_duplicates()
    .head()
)

In [None]:
# Passenger count for each (SibSp, Parch, FamilySize) combination
(
    df[['SibSp', 'Parch', 'FamilySize', 'PassengerId']]
    .groupby(['SibSp', 'Parch', 'FamilySize'])
    .count()
    .head()
)

In [None]:
# Label every passenger with a coarse family category derived from SibSp,
# Parch and FamilySize. np.select uses the first matching condition; rows that
# match none of them (e.g. both SibSp and Parch non-zero) get the default.
df['FamilyCategory'] = np.select(
    condlist=[
        df['SibSp'].eq(1) & df['Parch'].eq(0),
        df['SibSp'].eq(0) & df['Parch'].eq(1),
        df['SibSp'].gt(1) & df['Parch'].eq(0),
        df['SibSp'].eq(0) & df['Parch'].gt(1),
        df['SibSp'].eq(0) & df['Parch'].eq(0) & df['FamilySize'].ne(1),
        df['SibSp'].eq(0) & df['Parch'].eq(0) & df['FamilySize'].eq(1),
    ],
    choicelist=[
        'Couple',
        'Couple',
        'Couple and Children',
        'Couple and Children',
        'Relatives',
        'Single person',
    ],
    default='Single person',  # most passengers travelled by themselves
)

In [None]:
# Filling in the null values for Age based on each passenger's family category

In [None]:
# How many passengers with a missing age fall into each family category
df[df['Age'].isna()].groupby('FamilyCategory')['PassengerId'].count()

In [None]:
# Median known age per family category.
# 'Age' is selected before aggregating: calling .median() on the whole grouped
# frame also tries to aggregate the text columns, which raises on pandas >= 2.0.
df.loc[df['Age'].notna()].groupby('FamilyCategory')['Age'].median()

In [None]:
# Impute missing ages with one fixed value per family category
# (presumably the rounded per-category medians shown above -- confirm).
# Only the 'Age' column is written: a row-wide fillna on the selected rows
# would also overwrite every other missing field in them (e.g. 'Cabin')
# with the age value.
age_fill_by_category = {
    'Couple': 29,
    'Couple and Children': 26,
    'Relatives': 29,
    'Single person': 28,
}
df['Age'] = df['Age'].fillna(df['FamilyCategory'].map(age_fill_by_category))

In [None]:
# Preview the frame after the age imputation
df.head(n=5)

In [None]:
# Fare and Cabin for the rows where a cabin is recorded
df.loc[df['Cabin'].notna(), ['Fare', 'Cabin']]

In [None]:
# The 'Cabin' column is very badly populated. My decision will be to drop it.

In [None]:
# Drop the sparsely populated 'Cabin' column
df.drop(columns='Cabin', inplace=True)

In [None]:
# Preview the frame without 'Cabin'
df.head(n=5)

In [None]:
# The 'Ticket' column has a very high cardinality, making it difficult to use it for classification. My decision will also be to drop it

In [None]:
# Drop the high-cardinality 'Ticket' column and preview the result
df.drop(columns='Ticket', inplace=True)
df.head(n=5)

In [None]:
# Getting the data ready for training: Applying one-hot encoding to the categorical fields

In [None]:
# One-hot encode the remaining categorical columns
df = pd.get_dummies(data=df)

In [None]:
# 'Entitlement_the' is not a useful title column, so remove it
df.drop(columns=['Entitlement_the'], inplace=True)
# 'Dona.' can be found on the test dataset; add its column (all zeros) so the
# feature sets line up
df['Entitlement_Dona.'] = 0

In [None]:
# Generating a new profiling report for analysis

In [None]:
#profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
#profile.to_file('CuratedDataProfile.html')

In [None]:
# The data seems to be in a much better shape now for training a model!

In [None]:
# Features = every column except the label; hold out 20% of the rows,
# stratified on the label, with a fixed seed for reproducibility.
X = df.drop(columns='Survived').copy()
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Standardizing, casting and scaling the data

In [None]:
# Fit the scaler on the training split only and apply it to both splits, so the
# held-out rows do not influence the scaling statistics.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
input_dim = X_train.shape[1]  # number of input features, read by get_model

num_classes = 2

# Convert class vectors to binary class matrices (one column per class).
# `keras.utils.to_categorical` is used instead of `keras.utils.np_utils.*`
# because the `np_utils` submodule no longer exists in recent Keras releases.
y_train_arr = keras.utils.to_categorical(y_train, num_classes)
y_test_arr = keras.utils.to_categorical(y_test, num_classes)

# Step 3: Model Training and evaluation

In [None]:
# Functions to calculate accuracy metrics
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Counts are summed over every element of the tensors (no axis is given),
    so with one-hot targets all class columns contribute to the totals.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division defined when there are no positives
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())
def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Counts are summed over every element of the tensors (no axis is given),
    so with one-hot targets all class columns contribute to the totals.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positives = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the division defined when nothing is predicted positive
    return K.sum(hits) / (K.sum(flagged_positives) + K.epsilon())
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m (epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Builds and compiles the neural network (training happens in model.fit)
def get_model(qt_relu_layers, optimizer, epochs):
    """Build and compile a softmax classifier with two ReLU hidden layers.

    Reads the notebook-level ``input_dim`` and ``num_classes`` for the input
    and output sizes, prints the model summary, and returns the compiled
    (untrained) model.

    Args:
        qt_relu_layers: Number of units in the first hidden layer. The second
            hidden layer gets half as many, rounded down.
        optimizer: Optimizer forwarded unchanged to ``model.compile``.
        epochs: Not used while building the model. Presumably kept so that
            'epochs' can appear in a grid-search ``param_grid`` whose
            ``build_fn`` is this function -- confirm before removing.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Rectified Linear Unit (ReLU) as the 1st activation function:
    # it outputs the input directly if it is positive, otherwise it outputs zero
    model.add(Dense(qt_relu_layers, activation='relu', input_dim=input_dim))
    # Intermediate layer. The unit count must be an integer, so use floor
    # division rather than `/`, which would hand Dense a float.
    model.add(Dense(int(qt_relu_layers // 2), activation='relu'))
    # Softmax as the last activation function:
    # it normalizes the network output to a probability distribution over the
    # predicted output classes
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1_m])
    return model

In [None]:
# Mini-batch size; the following cell reassigns it to 128 before any training call reads it
batch_size = 64

In [None]:
# Baseline run: 128-unit first hidden layer, RMSprop, 20 epochs.
batch_size = 128
# Layer width gets its own name instead of reusing batch_size, so changing the
# mini-batch size no longer silently changes the network architecture.
qt_relu_layers = 128
epochs = 20
optimizer = 'RMSprop'
model = get_model(qt_relu_layers, optimizer, epochs)
history = model.fit(X_train_std, y_train_arr,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=True,
                    validation_data=(X_test_std, y_test_arr))
# evaluate() returns [loss, accuracy, f1_m] in the order given to compile()
score = model.evaluate(X_test_std, y_test_arr, verbose=0)
print('-----------------------------------------------------')
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('Test F1 score:', score[2])

In [None]:
%matplotlib inline
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy']) 
plt.title('Model Accuracy');
plt.ylabel('Accuracy');
plt.xlabel('Epoch'); 
plt.legend(['Train', 'Test'], loc='upper left');

In [None]:
# Grid search for the best hyper-parameters and cross-validated score.
# Disabled by default to save computing resources on a full run; set the flag
# to True to execute it. (A flag is used instead of wrapping the code in a
# string literal, which the notebook would echo as the cell's output.)
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    param_grid = {'qt_relu_layers': [2, 4, 16, 32, 64, 128],
                  'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'],
                  'epochs': [2, 5, 10, 20, 30, 40, 50, 100]
    }

    model = KerasClassifier(build_fn=get_model, verbose=False, batch_size=batch_size)

    use_all_processors = -1
    gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=use_all_processors)

    # Search on the standardised features, i.e. the same representation the
    # other training cells feed to the network.
    gs.fit(X_train_std, y_train_arr)

    print(gs.best_params_)
    print(gs.best_score_)

# Step 4: Training and evaluating the model that uses the best parameters found by the Grid Search

In [None]:
# Re-train with the selected settings and report metrics on the held-out split
qt_relu_layers, epochs, optimizer = 128, 400, 'Adam'
model = get_model(qt_relu_layers, optimizer, epochs)
history = model.fit(
    X_train_std,
    y_train_arr,
    batch_size=batch_size,
    epochs=epochs,
    verbose=False,
    validation_data=(X_test_std, y_test_arr),
)
score = model.evaluate(X_test_std, y_test_arr, verbose=0)
print('-----------------------------------------------------')
# score holds loss, accuracy and f1_m, in that order
for label, value in zip(('Test set loss:', 'Test set accuracy:', 'Test set F1 score:'), score):
    print(label, value)

In [None]:
%matplotlib inline
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy']) 
plt.title('Model Accuracy');
plt.ylabel('Accuracy');
plt.xlabel('Epoch'); 
plt.legend(['Train', 'Test'], loc='upper left');

In [None]:
print("Confusion Matrix for the Test Data:")
# Collapse one-hot targets / softmax outputs back to class indices
y_test_arg = y_test_arr.argmax(axis=1)
y_pred = model.predict(X_test_std).argmax(axis=1)
print(confusion_matrix(y_test_arg, y_pred))

# Step 5: Applying the model to the validation dataset

In [None]:
# Load the validation data
df_val = pd.read_csv(filepath_or_buffer='Data/validation.csv')

In [None]:
# Same name parsing as for the training frame: text before the first comma is
# 'Family', second space-separated token after it is 'Entitlement'.
val_name_parts = df_val['Name'].str.split(',')
df_val['Family'] = val_name_parts.str[0]
df_val['Entitlement'] = val_name_parts.str[1].str.split(' ').str[1]

In [None]:
# Attach 'FamilySize' (rows sharing the same 'Family' value) via a left join
df_family = (
    df_val[['Family', 'Name']]
    .groupby('Family')
    .size()
    .reset_index(name='FamilySize')
)
df_val = df_val.merge(df_family, on='Family', how='left')

In [None]:
# 'Name' and 'Family' are no longer needed once Entitlement / FamilySize exist
df_val.drop(columns=['Name', 'Family'], inplace=True)

In [None]:
# Passenger count for each (SibSp, Parch, FamilySize) combination
(
    df_val[['SibSp', 'Parch', 'FamilySize', 'PassengerId']]
    .groupby(['SibSp', 'Parch', 'FamilySize'])
    .count()
    .head()
)

In [None]:
# Same family categorisation as for the training frame. np.select uses the
# first matching condition; rows that match none (e.g. both SibSp and Parch
# non-zero) get the default.
df_val['FamilyCategory'] = np.select(
    condlist=[
        df_val['SibSp'].eq(1) & df_val['Parch'].eq(0),
        df_val['SibSp'].eq(0) & df_val['Parch'].eq(1),
        df_val['SibSp'].gt(1) & df_val['Parch'].eq(0),
        df_val['SibSp'].eq(0) & df_val['Parch'].gt(1),
        df_val['SibSp'].eq(0) & df_val['Parch'].eq(0) & df_val['FamilySize'].ne(1),
        df_val['SibSp'].eq(0) & df_val['Parch'].eq(0) & df_val['FamilySize'].eq(1),
    ],
    choicelist=[
        'Couple',
        'Couple',
        'Couple and Children',
        'Couple and Children',
        'Relatives',
        'Single person',
    ],
    default='Single person',  # most passengers travelled by themselves
)

In [None]:
# Median known age per family category in the validation frame.
# 'Age' is selected before aggregating: calling .median() on the whole grouped
# frame also tries to aggregate the text columns, which raises on pandas >= 2.0.
df_val.loc[df_val['Age'].notna()].groupby('FamilyCategory')['Age'].median()

In [None]:
# Impute missing validation ages with one fixed value per family category.
# Only the 'Age' column is written: a row-wide fillna on the selected rows
# would also overwrite every other missing field in them (e.g. 'Cabin', 'Fare')
# with the age value.
# NOTE(review): these constants differ from the ones used for the training
# frame; confirm whether the validation set should reuse the training values.
val_age_fill_by_category = {
    'Couple': 30,
    'Couple and Children': 27,
    'Relatives': 24,
    'Single person': 27,
}
df_val['Age'] = df_val['Age'].fillna(df_val['FamilyCategory'].map(val_age_fill_by_category))

In [None]:
# Drop 'Cabin', mirroring the training preparation
df_val.drop(columns='Cabin', inplace=True)

In [None]:
# Drop 'Ticket', mirroring the training preparation
df_val.drop(columns='Ticket', inplace=True)

In [None]:
# One-hot encode the remaining categorical columns
df_val = pd.get_dummies(data=df_val)

In [None]:
# Preview the encoded validation frame
df_val.head(n=5)

In [None]:
# Add all-zero indicator columns for these entitlements so the validation frame
# has them too (appended in this order).
for missing_title in ('Capt.', 'Don.', 'Mlle.', 'Mme.', 'Jonkheer.', 'Lady.', 'Major.', 'Sir.'):
    df_val['Entitlement_' + missing_title] = 0

In [None]:
# Validation feature frame. This binds the same object as df_val (no copy is
# made); it still contains 'PassengerId', which is read again when the results
# file is assembled.
X_val = df_val

In [None]:
# Scale the validation features with statistics learned from the training
# features `X`, and in the training column order. The final model is fitted on
# `X` standardised with its own mean/variance, so the validation matrix has to
# go through that same transformation. A scaler fitted on the validation set,
# or one-hot columns left in a different order than in `X`, would feed the
# network inputs that do not line up with what it was trained on.
sc = StandardScaler()
sc.fit(X)
# reindex puts the columns in X's order; any training column absent here is
# created as all zeros.
X_val_std = sc.transform(X_val.reindex(columns=X.columns, fill_value=0))

In [None]:
# Standardise the full training feature set and one-hot encode its labels for
# the final fit.
X_std = sc.fit_transform(X)
# `keras.utils.to_categorical` is used instead of `keras.utils.np_utils.*`
# because the `np_utils` submodule no longer exists in recent Keras releases.
y_arr = keras.utils.to_categorical(y, num_classes)

In [None]:
# Grid search for the best hyper-parameters and cross-validated score on the
# full standardised training set.
# Disabled by default to save computing resources on a full run; set the flag
# to True to execute it. (A flag is used instead of wrapping the code in a
# string literal, which the notebook would echo as the cell's output.)
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    param_grid = {
      'qt_relu_layers': [2, 4, 16, 32, 64, 128],
      'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'],
      'epochs': [2, 5, 10, 20, 30, 40, 50, 100, 200]
    }

    model = KerasClassifier(build_fn=get_model, verbose=False, batch_size=batch_size, epochs=epochs)

    use_all_processors = -1
    gs = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=use_all_processors)

    gs.fit(X_std, y_arr)

    print(gs.best_params_)
    print(gs.best_score_)

In [None]:
# Final fit on the full standardised training set with the chosen settings
qt_relu_layers, optimizer, epochs = 128, 'RMSprop', 200
model = get_model(qt_relu_layers, optimizer, epochs)
model.fit(
    X_std,
    y_arr,
    batch_size=batch_size,
    epochs=epochs,
    verbose=False,
)

In [None]:
# Class probabilities for every validation row
y_pred = model.predict(x=X_val_std)

In [None]:
# Pick the most probable class index per row
y_pred = y_pred.argmax(axis=1)

In [None]:
# Wrap the predicted classes in a single-column frame named 'Survived'
y_pred = pd.DataFrame(y_pred, columns=['Survived'])

In [None]:
# Put passenger ids and predictions side by side (aligned on the row index)
df_results = pd.concat(objs=[X_val['PassengerId'], y_pred], axis='columns')

In [None]:
# Write the results file without the index column
df_results.to_csv(path_or_buf='Results.csv', index=False)