# Final Project - Titanic Dataset 
**Jason "Scott" Person**

This project uses the Titanic dataset from the [Kaggle Titanic competition](https://www.kaggle.com/c/titanic/overview).

Fields include:

- **Name** (str) - Name of the passenger
- **Pclass** (int) - Ticket class
- **Sex** (str) - Sex of the passenger
- **Age** (float) - Age in years
- **SibSp** (int) - Number of siblings and spouses aboard
- **Parch** (int) - Number of parents and children aboard
- **Ticket** (str) - Ticket number
- **Fare** (float) - Ticket price paid
- **Cabin** (str) - Cabin number
- **Embarked** (str) - Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [0]:
# import libraries required to load, transform, analyze and plot data
# this is from the churn analysis notebook, which is the foundation for this project solution

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(context='paper', style='darkgrid', 
        rc={'figure.facecolor':'white'}, font_scale=1.2)

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import make_scorer, precision_recall_curve, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc, f1_score, roc_auc_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [0]:
# Customize seaborn plot styles
# Seaborn docs: https://seaborn.pydata.org/tutorial/aesthetics.html

# Adjust to retina quality: ask the inline backend to render figures in the
# "retina" format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

# Adjust dpi and font size
# NOTE(review): this is a second sns.set(...) call after the one in the import
# cell (context='paper', style='darkgrid', font_scale=1.2), and context and
# style are set again just below, so the earlier choices presumably do not
# survive -- confirm which look is intended.
sns.set(rc={"figure.dpi":100, 'savefig.dpi':300})
sns.set_context('notebook', font_scale = 0.8)

# Display tick marks
sns.set_style('ticks')

# Remove borders: turn off all four axes spines via matplotlib's rc settings
plt.rc('axes.spines', top=False, right=False, left=False, bottom=False)

In [0]:
# Color palettes for plots
# Named colors: https://matplotlib.org/stable/gallery/color/named_colors.html
# Seaborn color palette docs: https://seaborn.pydata.org/tutorial/color_palettes.html
# Seaborn palette chart: https://www.codecademy.com/article/seaborn-design-ii

# cp1 Color Palette - a binary blue/orange palette
blue = 'deepskyblue' # Use 'skyblue' for a lighter blue
orange = 'orange'
cp1 = [blue, orange]

# cp2 Color Palette - 5 colors for use with categorical data
# (reuses `blue` from cp1 as its first color)
turquoise = 'mediumaquamarine'
salmon = 'darksalmon'
tan = 'tan'
gray = 'darkgray'
cp2 = [blue, turquoise, salmon, tan, gray]

# cp3 Color Palette - blue-to-orange diverging palette for correlation heatmaps
# Positional arguments are the two anchor hues (242 and 39); s is the
# saturation, l the lightness and n the number of colors in the palette.
cp3 = sns.diverging_palette(242, 39, s=100, l=65, n=11)

# Set the default palette
# (cp1 becomes the palette seaborn uses when a plot does not pass its own)
sns.set_palette(cp1)

In [0]:
df = pd.read_csv('titanic.csv')
df.head(10)

In [0]:
# View dataframe fundamentals
df.info()

# Explore Categorical Features

In [0]:
# Print the share of rows taken by each distinct value (NaN counted as its
# own value) for a handful of columns
col_list = ['Pclass', 'Sex', 'Fare', 'Embarked']

for col in col_list:
    print(f'\nValue Counts | column = {col}')
    value_shares = df[col].value_counts(normalize=True, dropna=False)
    print(value_shares)

# Calculate required data

## Median age by Sex and Pclass

In [0]:
def calculate_median_age(X_df):
    """Compute the median of ``Age`` for every (``Sex``, ``Pclass``) group.

    Parameters:
    X_df (pd.DataFrame): frame with 'Sex', 'Pclass' and 'Age' columns

    Returns:
    median_ages_df (pd.DataFrame): one row per group with the columns
        'Sex', 'Pclass' and 'Median_Age'
    """
    ages_by_group = X_df.groupby(['Sex', 'Pclass'])['Age']

    # reset_index(name=...) turns the group keys back into columns and names
    # the value column in a single step
    return ages_by_group.median().reset_index(name='Median_Age')

In [0]:
def process_and_persist_median_ages(X_df):
    """Compute the per-group median ages and write them to 'median_ages.csv'.

    Parameters:
    X_df (pd.DataFrame): frame passed on to calculate_median_age

    Returns:
    None
    """
    # The file is written without the index column
    calculate_median_age(X_df).to_csv('median_ages.csv', index=False)

## Median fare by Pclass

In [0]:
def calculate_class_median_fare(X_df):
    """Compute the median of ``Fare`` for every ``Pclass``.

    Parameters:
    X_df (pd.DataFrame): frame with 'Pclass' and 'Fare' columns

    Returns:
    median_fare_df (pd.DataFrame): one row per class with the columns
        'Pclass' and 'Median_Fare'
    """
    fares_by_class = X_df.groupby(['Pclass'])['Fare']

    # reset_index(name=...) moves Pclass back into a column and names the
    # value column in a single step
    return fares_by_class.median().reset_index(name='Median_Fare')

In [0]:
def process_and_persist_fare(X_df):
    """Compute the per-class median fares and write them to 'median_class_fares.csv'.

    Parameters:
    X_df (pd.DataFrame): frame passed on to calculate_class_median_fare

    Returns:
    None
    """
    # The file is written without the index column
    calculate_class_median_fare(X_df).to_csv('median_class_fares.csv', index=False)

## Fare bin edges (quantile splits)

In [0]:
def calculate_bins(X_df, column, bin_count):
    """Compute quantile-based bin edges for one column.

    Parameters:
    X_df (pd.DataFrame): frame containing the data
    column (str): name of the column to compute edges for
    bin_count (int): number of quantile bins passed to pd.qcut

    Returns:
    bins_df (pd.DataFrame): single-column frame ('Bin_Edges') holding the edges
    """
    # With retbins=True qcut returns (binned values, edges); only the edges
    # are needed here
    _, edges = pd.qcut(X_df[column], q=bin_count, retbins=True)

    return pd.DataFrame({'Bin_Edges': edges})

In [0]:
def calculate_and_persist_fare_bins(X_df):
    """Compute 5 quantile bins for ``Fare`` and write the edges to 'fare_splits.csv'.

    Parameters:
    X_df (pd.DataFrame): frame with a 'Fare' column

    Returns:
    None
    """
    fare_bin_count = 5
    fare_edges_df = calculate_bins(X_df, 'Fare', fare_bin_count)

    # The file is written without the index column
    fare_edges_df.to_csv('fare_splits.csv', index=False)



# Data Preparation

## Functions

### fill_age

In [0]:
# fill age with median from group
def fill_age(X_df):
    """Fill missing ``age`` values with the persisted median age of the row's group.

    The medians are read from 'median_ages.csv' (columns 'Sex', 'Pclass',
    'Median_Age') and looked up by the row's ('sex', 'pclass') pair. Rows
    whose age is already present are left untouched; rows whose group has no
    persisted median stay missing.

    Parameters:
    X_df (pd.DataFrame): train or test slice with lowercase 'sex', 'pclass'
        and 'age' columns; modified in place

    Returns:
    X_df (pd.DataFrame): same dataframe with replaced values
    """
    # Load ages from storage - this would go in production pipeline
    median_ages_df = pd.read_csv('median_ages.csv')

    # {(Sex, Pclass): median age}
    median_ages = median_ages_df.set_index(['Sex', 'Pclass'])['Median_Age'].to_dict()

    # One lookup per row, done on plain tuples rather than a row-wise
    # DataFrame.apply: that is much cheaper and also works on an empty frame.
    # Unknown groups yield None, which becomes NaN in the float Series.
    group_medians = pd.Series(
        [median_ages.get(key) for key in zip(X_df['sex'], X_df['pclass'])],
        index=X_df.index,
        dtype='float64',
    )

    # Keep existing ages, substitute the group median only where age is missing
    has_age = X_df['age'].notna()
    X_df['age'] = X_df['age'].where(has_age, group_medians.to_numpy())

    return X_df

### fill_embarked

In [0]:
# fill embarked with 'S'

def fill_embarked(X_df):
    """Replace missing ``embarked`` values with 'S' (Southampton).

    'S' is used based on analysis by Evitan showing that the two passengers
    with a missing port actually embarked at Southampton.

    Parameters:
    X_df (pd.DataFrame): train or test slice with an 'embarked' column;
        modified in place

    Returns:
    X_df (pd.DataFrame): same dataframe with replaced values
    """
    default_port = 'S'
    embarked_filled = X_df['embarked'].fillna(default_port)
    X_df['embarked'] = embarked_filled

    return X_df

### fill_fare

In [0]:
def fill_fare(X_df):
    """Fill missing ``fare`` values with the persisted median fare of the row's class.

    The medians are read from 'median_class_fares.csv' (columns 'Pclass' and
    'Median_Fare'). Rows whose fare is already present are left untouched;
    rows whose class has no persisted median stay missing.

    Parameters:
    X_df (pd.DataFrame): train or test slice with lowercase 'pclass' and
        'fare' columns; modified in place

    Returns:
    X_df (pd.DataFrame): same dataframe with replaced values
    """
    # Load median fares from storage - this would go in production pipeline
    median_class_fares_df = pd.read_csv('median_class_fares.csv')

    # {Pclass: median fare}
    median_class_fares = median_class_fares_df.set_index('Pclass')['Median_Fare'].to_dict()

    # Vectorised lookup instead of a row-wise DataFrame.apply: cheaper, and it
    # also works on an empty frame. Unknown classes map to NaN.
    class_medians = X_df['pclass'].map(median_class_fares)

    # Keep existing fares, substitute the class median only where fare is missing
    has_fare = X_df['fare'].notna()
    X_df['fare'] = X_df['fare'].where(has_fare, class_medians.to_numpy())

    return X_df

### bin_age and bin_fare

In [0]:
def bin_age(X_df):
    """Bin ``age`` into labelled groups stored in a new ``age_bin`` column.

    The groups are Child [0, 12], Teen (12, 17], YoungAdult (17, 30],
    Adult (30, 50] and Senior (50, 100]. Ages outside 0-100 and missing ages
    produce a missing bin.

    Parameters:
    X_df (pd.DataFrame): train or test slice with an 'age' column; modified
        in place

    Returns:
    X_df (pd.DataFrame): same dataframe with the new 'age_bin' column
    """
    # Define age edges
    age_edges = [0, 12, 17, 30, 50, 100]
    age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']

    # Bin age using predefined edges. include_lowest=True closes the first
    # interval on the left so that an age of exactly 0 lands in 'Child'
    # instead of becoming NaN.
    X_df['age_bin'] = pd.cut(
        X_df['age'],
        bins=age_edges,
        labels=age_labels,
        include_lowest=True,
    )

    return X_df

In [0]:
def bin_fare(X_df):
    """Bin ``fare`` into a new ``fare_bin`` column using persisted edges.

    The edges are read from the 'Bin_Edges' column of 'fare_splits.csv'.

    Parameters:
    X_df (pd.DataFrame): train or test slice with a 'fare' column; modified
        in place

    Returns:
    X_df (pd.DataFrame): same dataframe with the new 'fare_bin' column
    """
    # Load fare edges from fare_splits.csv
    fare_edges_df = pd.read_csv('fare_splits.csv')
    fare_edges = fare_edges_df['Bin_Edges'].tolist()

    # Bin fare using predefined edges. include_lowest=True closes the first
    # interval on the left; without it every fare equal to the lowest edge
    # would fall outside all bins and become NaN.
    # NOTE: fares below the first or above the last edge still produce NaN.
    X_df['fare_bin'] = pd.cut(X_df['fare'], bins=fare_edges, include_lowest=True)

    return X_df

### bin_family_count


In [0]:
def bin_family_count(X_df):
    """Add ``family_count`` (sibsp + parch) and a grouped ``family_count_bin``.

    Groups: 0 -> 'Alone', 1-3 -> 'Small', 4-5 -> 'Medium', 6 and above ->
    'Large'. A missing count produces a missing bin.

    Parameters:
    X_df (pd.DataFrame): train or test slice with 'sibsp' and 'parch'
        columns; modified in place

    Returns:
    X_df (pd.DataFrame): same dataframe with the two new columns
    """
    # Family count
    X_df['family_count'] = X_df['sibsp'] + X_df['parch']

    # Bin family size by range rather than by an explicit per-value lookup,
    # so sizes that were never listed (8, 9, 11, ...) are labelled 'Large'
    # instead of silently becoming NaN.
    size_edges = [-1, 0, 3, 5, float('inf')]
    size_labels = ['Alone', 'Small', 'Medium', 'Large']
    size_bins = pd.cut(X_df['family_count'], bins=size_edges, labels=size_labels)

    # Store plain label values (not a categorical) as the original mapping did
    X_df['family_count_bin'] = size_bins.astype(object)

    return X_df

### create_title_feature

In [0]:
def create_title_feature(X_df):
    """Add a grouped ``title`` column and an ``is_married`` flag derived from ``name``.

    The title is the text between the first ', ' and the following '.' in the
    name (e.g. 'Braund, Mr. Owen Harris' -> 'Mr'). ``is_married`` is 1 when
    the raw title is 'Mrs' and 0 otherwise. Female titles are then merged
    into 'Miss/Mrs/Ms' and professional, military, noble and clergy titles
    into 'Dr/Military/Noble/Clergy'; all other titles are kept as they are.

    Parameters:
    X_df (pd.DataFrame): train or test slice with a 'name' column; modified
        in place

    Returns:
    X_df (pd.DataFrame): same dataframe with the new columns
    """
    raw_title = X_df['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    X_df['title'] = raw_title

    # Build the flag in one direct column assignment. The previous chained
    # form (X_df['is_married'].loc[mask] = 1) writes through an intermediate
    # Series, which pandas warns about and which has no effect on X_df when
    # Copy-on-Write is enabled.
    X_df['is_married'] = (raw_title == 'Mrs').astype(int)

    X_df['title'] = X_df['title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
    X_df['title'] = X_df['title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')

    return X_df

### create_deck_feature

In [0]:
def create_deck_feature(X_df):
    """Add a ``deck`` column holding the first character of ``cabin``.

    Rows with a missing cabin get the placeholder deck 'M'.

    Parameters:
    X_df (pd.DataFrame): train or test slice with a 'cabin' column; modified
        in place

    Returns:
    X_df (pd.DataFrame): same dataframe with the new 'deck' column
    """

    def _deck_letter(cabin):
        # Missing cabin -> placeholder, otherwise the leading character
        if pd.isnull(cabin):
            return 'M'
        return cabin[0]

    X_df['deck'] = X_df['cabin'].map(_deck_letter)

    return X_df

### one_hot_encode_categories

In [0]:
# convert categorical columns to one-hot encoding features
def ohe_categories(X_df):
    """Replace the categorical columns with one-hot encoded (OHE) columns.

    Parameters:
    X_df (pd.DataFrame): train or test slice containing every column listed
        in ``categoricals`` below; its 'pclass' column is converted to str
        in place

    Returns:
    X_df (pd.DataFrame): new dataframe holding the remaining original
        columns followed by the OHE columns
    """
    # multi-class variables to encode
    categoricals = ['pclass', 'sex', 'embarked', 'title', 'deck', 'family_count_bin', 'fare_bin','age_bin']

    # get_dummies passes numeric columns through unchanged, so pclass has to
    # be a string column to be encoded at all
    X_df['pclass'] = X_df['pclass'].astype(str)

    # one 0/1 integer column per observed category, no reference level dropped
    # NOTE: the columns come from the values present in this frame, so two
    # frames with different observed categories yield different column sets.
    encoded = pd.get_dummies(X_df[categoricals], drop_first=False, dtype=int)

    # append the encoded columns, then remove the source columns
    combined = pd.concat([X_df, encoded], axis=1)
    return combined.drop(columns=categoricals)

### lower_case_column_names

In [0]:
def rename_columns_lowercase(X_df):
    """Lowercase every column name of the dataframe in place.

    Parameters:
    X_df (pd.DataFrame): DataFrame whose columns need to be renamed

    Returns:
    X_df (pd.DataFrame): the same dataframe object, with lowercase column names
    """
    lowered_names = list(map(lambda label: label.lower(), X_df.columns))
    X_df.columns = lowered_names

    return X_df

### Drop Unneeded Columns

In [0]:
def drop_columns(X_df, columns_to_drop):
    """Remove the given columns from the dataframe in place.

    Parameters:
    X_df (pd.DataFrame): DataFrame from which columns need to be dropped
    columns_to_drop (list): List of column names to drop

    Returns:
    X_df (pd.DataFrame): the same dataframe object, without the dropped columns
    """
    X_df.drop(labels=columns_to_drop, axis='columns', inplace=True)

    return X_df

In [0]:
def drop_specified_columns(X_df):
    """Drop the fixed set of columns that is not wanted in the prepared data.

    Parameters:
    X_df (pd.DataFrame): DataFrame containing every column named in the list
        below

    Returns:
    X_df (pd.DataFrame): DataFrame with those columns dropped
    """
    # identifiers, raw inputs and intermediate columns to remove
    unneeded_columns = [
        'passengerid', 'name', 'age', 'sibsp', 'parch',
        'ticket', 'fare', 'cabin', 'family_count', 'is_married',
    ]
    trimmed = drop_columns(X_df, unneeded_columns)
    return trimmed

## Data Prep Pipeline

In [0]:
# function holds data preparation pipeline for X predictors dataframe
def data_prep_pipe(X_df):
    """Run the full data preparation pipeline on an X features dataframe.

    Each preparation function is wrapped in a FunctionTransformer and chained
    in a scikit-learn Pipeline. Column names are lowercased first because the
    later steps address columns by their lowercase names, and lowercased
    again at the end.

    Parameters:
    X_df (pd.DataFrame): train or test slice containing the predictors

    Returns:
    X_df_tr (pd.DataFrame): train or test dataframe, transformed
    """
    # (step name, preparation function) in execution order
    ordered_steps = [
        ('rename_columns_lowercase', rename_columns_lowercase),
        ('fill_age', fill_age),
        ('fill_embarked', fill_embarked),
        ('fill_fare', fill_fare),
        ('bin_age', bin_age),
        ('bin_fare', bin_fare),
        ('bin_family_count', bin_family_count),
        ('create_title_feature', create_title_feature),
        ('create_deck_feature', create_deck_feature),
        ('ohe_categories', ohe_categories),
        ('drop_specified_columns', drop_specified_columns),
        ('rename_columns_lowercase_again', rename_columns_lowercase),
    ]

    # wrap every function as a transformer and assemble the pipeline
    prep_pipe = Pipeline(
        memory=None,
        steps=[
            (step_name, FunctionTransformer(step_func, validate=False))
            for step_name, step_func in ordered_steps
        ],
    )

    # run all steps on the dataframe and hand back the result
    return prep_pipe.fit_transform(X_df)

# Run Pipeline

## Train Test Split

In [0]:
# Fixed seed so the split is reproducible
SEED = 42

# Target vector (Survived) and predictor matrix (every other column)
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)

## Calculate Persisted Data Prep Values

In [0]:
# Calculate and persist the values the data prep functions later read back
# from disk: median ages (median_ages.csv), median fares per class
# (median_class_fares.csv) and the fare bin edges (fare_splits.csv).
# They are computed from the training set only to prevent leakage.
# Eventually I would refactor this to use the pipeline - just don't have time right now!
# Note: Evitan used the union of training and test data to calculate averages. I believe that this may be an error. https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial block In [7]

process_and_persist_median_ages(X_train)
process_and_persist_fare(X_train)
calculate_and_persist_fare_bins(X_train)

## Prep Data

In [0]:
# send both X_train and X_test through data prep steps
X_train = data_prep_pipe(X_train)
X_test = data_prep_pipe(X_test)

In [0]:
df.head()

In [0]:
X_train.info()

In [0]:
X_train.head()

In [0]:
X_test.head()

In [0]:
X_train.describe()

In [0]:
df.describe()

In [0]:
df.info(
)

In [0]:
X_train.index

In [0]:
print(df['Pclass'].unique())