# Imports

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier

import plotly.graph_objects as go
import plotly.express as px

import dash
from dash import html, dcc
import dash_daq as daq
from dash.dependencies import Input, Output, State

# Data Preparation

In [2]:
# Load data
# Columns removed right after loading (presumably redundant with the retained
# class / embark_town / survived columns -- confirm against the dataset docs).
columns_to_drop = ['pclass', 'embarked', 'alive']
titanic_raw = sns.load_dataset('titanic')
df = titanic_raw.drop(columns=columns_to_drop)
df

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,man,True,,Southampton,False
1,1,female,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,1,female,26.0,0,0,7.9250,Third,woman,False,,Southampton,True
3,1,female,35.0,1,0,53.1000,First,woman,False,C,Southampton,False
4,0,male,35.0,0,0,8.0500,Third,man,True,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,13.0000,Second,man,True,,Southampton,True
887,1,female,19.0,0,0,30.0000,First,woman,False,B,Southampton,True
888,0,female,,1,2,23.4500,Third,woman,False,,Southampton,False
889,1,male,26.0,0,0,30.0000,First,man,True,C,Cherbourg,True


In [3]:
# Format data for dashboard
# Headers: capitalise the first letter, then turn underscores into spaces.
df.columns = [header.capitalize().replace('_', ' ') for header in df.columns]
df = df.rename(columns={'Sex': 'Gender'})

# Capitalise the values of every object-dtype (text) column; missing values stay missing.
text_columns = df.select_dtypes('object').columns
df[text_columns] = df[text_columns].apply(lambda column: column.str.capitalize())
df

Unnamed: 0,Survived,Gender,Age,Sibsp,Parch,Fare,Class,Who,Adult male,Deck,Embark town,Alone
0,0,Male,22.0,1,0,7.2500,Third,Man,True,,Southampton,False
1,1,Female,38.0,1,0,71.2833,First,Woman,False,C,Cherbourg,False
2,1,Female,26.0,0,0,7.9250,Third,Woman,False,,Southampton,True
3,1,Female,35.0,1,0,53.1000,First,Woman,False,C,Southampton,False
4,0,Male,35.0,0,0,8.0500,Third,Man,True,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,Male,27.0,0,0,13.0000,Second,Man,True,,Southampton,True
887,1,Female,19.0,0,0,30.0000,First,Woman,False,B,Southampton,True
888,0,Female,,1,2,23.4500,Third,Woman,False,,Southampton,False
889,1,Male,26.0,0,0,30.0000,First,Man,True,C,Cherbourg,True


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Survived     891 non-null    int64   
 1   Gender       891 non-null    object  
 2   Age          714 non-null    float64 
 3   Sibsp        891 non-null    int64   
 4   Parch        891 non-null    int64   
 5   Fare         891 non-null    float64 
 6   Class        891 non-null    category
 7   Who          891 non-null    object  
 8   Adult male   891 non-null    bool    
 9   Deck         203 non-null    category
 10  Embark town  889 non-null    object  
 11  Alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(3), object(3)
memory usage: 59.8+ KB


In [5]:
# Partition into train and test splits
TARGET = 'Survived'
X, y = df.drop(columns=TARGET), df[TARGET]

# Numeric and boolean columns are treated as numerical features; every other
# column is treated as categorical and cast to plain object dtype.
numeric_like = ['number', 'boolean']
numerical = X.select_dtypes(include=numeric_like).columns
categorical = X.select_dtypes(exclude=numeric_like).columns
X[categorical] = X[categorical].astype('object')

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Re-attach the target to each split for convenient inspection downstream.
train = pd.concat([X_train, y_train], axis='columns')
test = pd.concat([X_test, y_test], axis='columns')

In [6]:
# # Build pipeline (original version from the Medium article; kept for reference only, superseded by the next cell)
# pipeline = Pipeline([
#     ('preprocessor', ColumnTransformer(transformers=[
#         ('cat', Pipeline([
#             ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
#             ('encoder', OneHotEncoder(sparse=False))
            
#         ]), categorical),
#         ('num', SimpleImputer(strategy='mean'), numerical)
#     ])),
#     ('model', RandomForestClassifier(random_state=42))
# ])
# pipeline.fit(X_train, y_train)

In [7]:
# Build pipeline
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back to
# the legacy one on older installations. Either way the encoder returns a dense
# array. handle_unknown='ignore' encodes categories that were not seen during
# fit as all-zero columns instead of raising at prediction time.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = make_column_transformer(
    (
        # Categorical columns: fill gaps with an explicit 'Missing' level, then one-hot encode.
        make_pipeline(
            SimpleImputer(strategy='constant', fill_value='Missing'),
            one_hot_encoder
        ),
        categorical
    ),
    (
        # Numerical columns: replace gaps with the column mean.
        SimpleImputer(strategy='mean'),
        numerical
    )
)

pipeline = make_pipeline(
    ct, RandomForestClassifier(random_state=42)
)
pipeline.fit(X_train, y_train)

In [8]:
# Add predicted probabilities
# Column 1 of predict_proba is the probability of the second class in
# pipeline.classes_ (the positive class, 1, for this 0/1 target).
test['Probability'] = pipeline.predict_proba(X_test)[:, 1]

# Derive both columns from y_test rather than from test[TARGET]: the cell then
# gives the same result when re-run, instead of re-mapping already-mapped
# 'No'/'Yes' strings into NaN.
test['Target'] = y_test
test[TARGET] = y_test.map({0: 'No', 1: 'Yes'})

# Fixed 10-point bins. Passing an integer bin count to pd.cut would derive the
# edges from the observed min/max of the predictions, so the labels below would
# only be accurate if the predictions happened to span exactly 0..1.
edges_pct = np.arange(0, 101, 10)
labels = [f"{low}% to <{high}%" for low, high in zip(edges_pct[:-1], edges_pct[1:])]

bin_edges = edges_pct / 100
# right=False makes every bin [low, high); nudge the last edge up by one float
# step so a probability of exactly 1.0 still falls into the top bin.
bin_edges[-1] = np.nextafter(bin_edges[-1], np.inf)
test['Binned probability'] = pd.cut(test['Probability'],
                                    bin_edges,
                                    labels=labels,
                                    right=False)

In [9]:
# Inspect the test split together with the prediction columns added in the previous cell
test

Unnamed: 0,Gender,Age,Sibsp,Parch,Fare,Class,Who,Adult male,Deck,Embark town,Alone,Survived,Probability,Target,Binned probability
157,Male,30.0,0,0,8.0500,Third,Man,True,,Southampton,True,No,0.105000,0,10% to <20%
501,Female,21.0,0,0,7.7500,Third,Woman,False,,Queenstown,True,No,0.944877,0,90% to <100%
352,Male,15.0,1,1,7.2292,Third,Child,False,,Cherbourg,False,No,0.730000,0,70% to <80%
82,Female,,0,0,7.7875,Third,Woman,False,,Queenstown,True,Yes,0.818212,1,80% to <90%
683,Male,14.0,5,2,46.9000,Third,Child,False,,Southampton,False,No,0.060000,0,0% to <10%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,Female,50.0,0,1,26.0000,Second,Woman,False,,Southampton,False,Yes,0.780000,1,70% to <80%
91,Male,20.0,0,0,7.8542,Third,Man,True,,Southampton,True,No,0.000000,0,0% to <10%
341,Female,24.0,3,2,263.0000,First,Woman,False,C,Southampton,False,Yes,0.550000,1,50% to <60%
115,Male,21.0,0,0,7.9250,Third,Man,True,,Southampton,True,No,0.073333,0,0% to <10%


In [10]:
# Helper functions for dropdowns and slider
def create_dropdown_options(series):
    """Build a list of ``{'label': v, 'value': v}`` dicts from a series.

    One option is produced per distinct value of ``series``, in sorted order.
    Values are converted with ``.tolist()`` so they are native Python objects
    (e.g. ``int`` rather than ``numpy.int64``), matching what
    ``create_dropdown_value`` returns and keeping the options serialisable
    with the standard ``json`` module.

    Parameters
    ----------
    series : pandas.Series
        Column whose distinct values become the options.

    Returns
    -------
    list of dict
        One ``{'label': value, 'value': value}`` entry per distinct value.
    """
    distinct_values = series.sort_values().unique().tolist()
    options = [{'label': value, 'value': value} for value in distinct_values]
    return options

def create_dropdown_value(series):
    """Return the distinct values of ``series``, in sorted order, as a plain list."""
    ordered = series.sort_values()
    distinct = ordered.unique()
    return distinct.tolist()

def create_slider_marks(values):
    """Map each entry of ``values`` to a ``{'label': str(entry)}`` dict.

    The returned dict is keyed by the entries themselves, in iteration order.
    """
    marks = {}
    for position in values:
        marks[position] = {'label': str(position)}
    return marks