# Home task: Kaggle Competition [Titanic - Machine Learning from Disaster](https://www.kaggle.com/c/titanic)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
CWD = os.getcwd()
DATA_DIR = os.path.join(CWD, 'titanic_data')
TRAIN_FILE = os.path.join(DATA_DIR, 'train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test.csv')

### Exploratory Data Analysis (EDA)

In [None]:
# Load train dataset
train_data = pd.read_csv(TRAIN_FILE)
train_data.head()

In the train dataset we have 12 columns:
 - `PassengerId` - passenger id
 - `Survived` - survival (0 = No, 1 = Yes)
 - `Pclass` - ticket class (1 = 1st, 2 = 2nd, 3 = 3rd): a proxy for socio-economic status (1st = upper, 2nd = middle, 3rd = lower)
 - `Name` - passenger name
 - `Sex` - passenger sex
 - `Age` - passenger age in years: age is fractional if less than 1; if age is estimated, is it in the form of xx.5
 - `SibSp` - number of siblings/spouses aboard the Titanic: sibling = brother, sister, stepbrother, stepsister; spouse = husband, wife (mistresses and fiancés were ignored)
 - `Parch` - number of parents/children aboard the Titanic: parent = mother, father; child = daughter, son, stepdaughter, stepson; some children travelled only with a nanny, therefore parch = 0 for them
 - `Ticket` - ticket number
 - `Fare` - passenger fare
 - `Cabin` - cabin number
 - `Embarked` - port of embarkation	(C = Cherbourg, Q = Queenstown, S = Southampton)

Columns `PassengerId`, `Name`, `Ticket` and `Cabin` are irrelevent for prediction whether the passenger were survived, so we will ignore them.

Now let's build the plots of column values depending on whether the passenger were survived.

In [None]:
plt.figure(figsize=(13, 4))

# Bar chart of Pclass column depending on passenger survival
plt.subplot(1, 3, 1, ylabel='Count')
sns.countplot(train_data, x='Pclass', hue='Survived')

# Histogram of Age column depending on passenger survival
plt.subplot(1, 3, 2)
sns.histplot(train_data, x='Age', hue='Survived', bins=10)

# Bar chart of Sex column depending on passenger survival
plt.subplot(1, 3, 3, ylabel='Count')
sns.countplot(train_data, x='Sex', hue='Survived');

The first plot (`Pclass` column) shows us that the higher the ticket class (from 3rd to 1st), the higher the survival rate. In the 1st class number of passengers who survived higher than the number of passengers who did not survived.

In the second plot (`Age` column) we can see that survival rate is low among adult passengers. However, most child passengers under the age of 10 were survived.

The third plot (`Sex` column) shows that the survival rate among female passengers is higher than male passengers.

In [None]:
plt.figure(figsize=(13, 4))

# Histogram of SibSp column depending on passenger survival
plt.subplot(1, 3, 1)
sns.histplot(train_data, x='SibSp', hue='Survived', discrete=True)

# Histogram of Parch column depending on passenger survival
plt.subplot(1, 3, 2)
sns.histplot(train_data, x='Parch', hue='Survived', discrete=True)

# Histogram of Fare column depending on passenger survival
plt.subplot(1, 3, 3)
sns.histplot(train_data, x='Fare', hue='Survived', bins=10);

In the first and second plots (columns `SibSp` and `Parch`) we can see that number of siblings/spouses or parents/children aboard does not affect the survival ratio. However, it is worth noting that for passengers who had one sibling/spouse or parent/child aboard survival ratio is higher.

The third plot (`Fare` column) shows us that the survival ratio is higher among passengers who have paid between 50 and 150.

In [None]:
# Bar chart of Embarked column depending on passenger survival
plt.figure(figsize=(4, 4))
plt.ylabel('Count')
sns.countplot(train_data, x='Embarked', hue='Survived');

This plot shows that the survival ratio is the highest among passengers who have embarked from Cherbourg.

### Data Cleaning

Before we start cleaning the data, we need to analyze the data types in different columns determine if there are NA values in the dataset.

In [None]:
train_data.info()

In [None]:
train_data.isna().sum()

So, `Age` column has 177 NA values and `Embarked` column has 2 NA values (`Cabin` column is ignored). For further forecasting we need fill in the NA values in these columns with the aggregated values from dataset.

Let's create a new column which contains passenger initials (honorifics): Mr, Miss, Mrs, Sir, Lady etc.

In [None]:
initials = train_data['Name'].str.extract('(\w+)\.').squeeze()
initials.unique()

Now, we reduce the number of possible passenger initials.

In [None]:
initials.replace(
    ['Mlle', 'Mme', 'Ms'] +                  # To Miss
    ['Lady', 'Countess'] +                   # To Mrs
    ['Dr', 'Major', 'Capt', 'Sir', 'Don'] +  # To Mr
    ['Jonkheer', 'Col', 'Rev'],              # To Other
    ['Miss'] * 3 + ['Mrs'] * 2 + ['Mr'] * 5 + ['Other'] * 3,
    inplace=True
)
initials.unique()

Then we can fill in the NA values in `Age` column with the age mean values for different passenger initials.

In [None]:
# Age mean values for different passenger initials (estimation can be xx.0 or xx.5)
age_means = train_data.groupby(initials)['Age'].mean().round(1)
age_means - (age_means % 0.5)

We can fill in the NA values in `Embarked` column with the most common value in this column.

In [None]:
# Most common embarkation value (mode)
train_data['Embarked'].value_counts().index[0]

Now, we investigate the test dataset.

In [None]:
# Load test dataset
test_data = pd.read_csv(TEST_FILE)
test_data.info()

Here we will also determine if the dataset contains NA values.

In [None]:
test_data.isna().sum()

In the test dataset `Age` column has 86 NA values and `Fare` column has one NA value.

To fill in the NA values in `Age` column, we follow the same process.

In [None]:
initials = test_data['Name'].str.extract('(\w+)\.').squeeze()
initials.unique()

In [None]:
initials.replace(
    ['Ms'] +            # To Miss
    ['Lady', 'Dona'] +  # To Mrs
    ['Dr'] +            # To Mr
    ['Col', 'Rev'],     # To Other
    ['Miss'] + ['Mrs'] * 2 + ['Mr'] + ['Other'] * 2,
    inplace=True
)
initials.unique()

In [None]:
# Age mean values for different passenger initials (estimation can be xx.0 or xx.5)
age_means = test_data.groupby(initials)['Age'].mean().round(1)
age_means - (age_means % 0.5)

To fill in the NA values in `Fare` column, we can use the average value in this column.

In [None]:
# Average fare value
test_data['Fare'].mean()

To be able to apply cleaning process to train and test dataset, we create clean_titanic_data function.

In [None]:
def clean_titanic_data(data, return_y=True):
    '''
    Performs data cleaning in titanic dataset.

    :param data: A dataset
    :type data: DataFrame
    :param return_y: If `True`, returns `(ids, X, y)`, if `False`, returns `(ids, X)`
    :type return_y: bool

    :raises TypeError: Occurs if `data` does not contain `Survived` column when `return_y=True`

    :return: `(ids, X, y)` or `(ids, X)` where `ids` is passenger ids, ndarray of shape (?,),
    `X` is features values, ndarray of shape (?, n), `y` is target values, ndarray of shape (?,)
    :rtype: tuple[ndarray, ndarray, ndarray] | tuple[ndarray, ndarray]
    '''
    
    # Copy the dataset
    titanic = data.copy()

    # Create columns of passenger initials (honorifics)
    titanic['Initial'] = titanic['Name'].str.extract('(\w+)\.')
    titanic['Initial'] = titanic['Initial'].replace(
        ['Mlle', 'Mme', 'Ms'] +                  # Replace to Miss
        ['Lady', 'Countess', 'Dona'] +           # Replace to Mrs
        ['Dr', 'Major', 'Capt', 'Sir', 'Don'] +  # Replace to Mr
        ['Jonkheer', 'Col', 'Rev'],              # Replace to Other
        ['Miss'] * 3 + ['Mrs'] * 3 + ['Mr'] * 5 + ['Other'] * 3
    )

    # Fill in NA values in Age column with age mean values for different passenger initials
    age_means = titanic.groupby('Initial')['Age'].mean().round(1)
    age_means -= age_means % 0.5
    titanic.loc[data['Age'].isna(), 'Age'] = titanic.loc[data['Age'].isna(), 'Initial'].map(age_means.to_dict())

    # Fill in NA values in Fare column with fare average value
    fare_mean = titanic['Fare'].mean()
    titanic['Fare'] = titanic['Fare'].fillna(fare_mean)

    # Fill in NA values in Embarked column with embarkation mode value
    embarkation_mode = titanic['Embarked'].value_counts().index[0]
    titanic['Embarked'] = titanic['Embarked'].fillna(embarkation_mode)

    # Convert values in Sex column to numeric
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})

    # Create dummy features which represent unique values in Embarked column
    embarkations = pd.get_dummies(titanic['Embarked'], prefix='Embarked', dtype='int')
    titanic = pd.concat([titanic, embarkations], axis=1)

    # Name of passenger id column
    id_feature = 'PassengerId'

    # Names of feature columns and target column
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + embarkations.columns.to_list()
    target = 'Survived'

    # Return ids values, feature values and target values
    if return_y:
        if target not in titanic.columns:
            raise TypeError(f'Dataset does not contain a {target} column')
        return titanic[id_feature].values, titanic[features].values, titanic[target].values
    
    # Return only ids values and feature values
    else:
        return titanic[id_feature].values, titanic[features].values

### ML Models Building

In [None]:
# Clean train dataset
_, X_train, y_train = clean_titanic_data(train_data, return_y=True)
print(f'Shape of features: {X_train.shape}')
print(f'Shape of targets: {y_train.shape}')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize dataset using min-max scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

#### K-Nearest Neighbors (KNeighborsClassifier)

In [None]:
# 
clf = KNeighborsClassifier()
params = {'n_neighbors': list(range(3, 20))}
grid_clf = GridSearchCV(clf, params).fit(X_train_scaled, y_train)

print('Best parameters (KNeighborsClassifier):', grid_clf.best_params_)
print('Best score (accuracy):', grid_clf.best_score_)

#### Logistic Regression (LogisticRegression)

In [None]:
# 
clf = LogisticRegression()
params = {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]}
grid_clf = GridSearchCV(clf, params).fit(X_train_scaled, y_train)

print('Best parameters (LogisticRegression):', grid_clf.best_params_)
print('Best score (accuracy):', grid_clf.best_score_)

#### Support Vector Machine (SVC)

In [None]:
# 
clf = SVC()
params = [
    {'kernel': 'linear', 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]},
    {'kernel': 'poly', 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10], 'degree': [2, 3, 4, 5]},
    {'kernel': 'rbf', 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10], 
     'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 50, 100]}
]
grid_clf = GridSearchCV(clf, params).fit(X_train_scaled, y_train)

print('Best parameters (SVC):', grid_clf.best_params_)
print('Best score (accuracy):', grid_clf.best_score_)

In [80]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
grid_clf = GridSearchCV(clf, {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]}, return_train_score=True)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'C': 5}
Grid best score (accuracy): 0.8099203231960498


In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
grid_clf = GridSearchCV(clf, {'n_neighbors': [3, 5, 7, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]}, return_train_score=True)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'n_neighbors': 3}
Grid best score (accuracy): 0.7979014700931433


In [81]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

clf = SVC(kernel='rbf')
grid_clf = GridSearchCV(
    clf, 
    {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10], 'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100]},
    return_train_score=True
)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'C': 5, 'gamma': 10}
Grid best score (accuracy): 0.8129278419930424


In [82]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=2023)
grid_clf = GridSearchCV(
    clf, 
    {'criterion': ['gini', 'entropy'], 'max_depth': [2, 5, 7, 10, 20, 30, 35, 40, 50, 100]},
    return_train_score=True
)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'criterion': 'gini', 'max_depth': 5}
Grid best score (accuracy): 0.817383009763214


In [83]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=2023)
grid_clf = GridSearchCV(
    clf, 
    {'criterion': ['gini', 'entropy', 'log_loss'], 'n_estimators': [20, 50, 100, 150, 200, 250, 300]},
    return_train_score=True
)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'criterion': 'gini', 'n_estimators': 300}
Grid best score (accuracy): 0.8114689709347997


In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=2023)
grid_clf = GridSearchCV(
    clf, 
    {'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10], 'n_estimators': [20, 50, 100, 150, 200, 250, 300]},
    return_train_score=True
)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

Grid best parameter (max accuracy): {'learning_rate': 0.05, 'n_estimators': 200}
Grid best score (accuracy): 0.8368308831780945


In [85]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

clf = XGBClassifier(random_state=2023)
grid_clf = GridSearchCV(
    clf, 
    {'max_depth': [2, 5, 7, 10, 20, 30, 35, 40, 50, 100], 'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10],
     'n_estimators': [20, 50, 100, 150, 200, 250, 300], 'reg_lambda': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]},
    return_train_score=True
)

grid_clf.fit(X_train_scaled, y_train)
print('Grid best parameter (max accuracy):', grid_clf.best_params_)
print('Grid best score (accuracy):', grid_clf.best_score_)

KeyboardInterrupt: 