In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve
)

url = ("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls")
df = pd.read_excel(url)
orig_df = df

In [2]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [4]:
# Manually calculate pearson
numerator = np.cov(df[["survived", "pclass"]].T)[0][1]

stddevs = df[["survived", "pclass"]].std()
denominator = stddevs[0] * stddevs[1]

print(numerator/denominator)

-0.3124693626496759


In [5]:
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [6]:
# Rows with nulls
mask = df.isnull().any(axis=1)
#mask.head()

df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [7]:
df.sex.value_counts(dropna=False)
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [8]:
# Drop unhelpful or leaky variables

df = df.drop(columns=[
    "name",
    "ticket",
    "home.dest", 
    "boat",
    "body",
    "cabin",
])

df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [9]:
# Create dummy variables for sex and embarked

df = pd.get_dummies(df)
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [10]:
# Remove multicorrelated
df = df.drop(columns="sex_male")

In [11]:
# Ready to go, set x and y
y = df.survived
X = df.drop(columns="survived")


## Split

In [12]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
X_train.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
count,916.0,729.0,916.0,916.0,915.0,916.0,916.0,916.0,916.0
mean,2.305677,29.102309,0.539301,0.386463,33.709221,0.353712,0.204148,0.098253,0.696507
std,0.841811,13.866954,1.082188,0.893933,52.840656,0.478382,0.403298,0.297819,0.460018
min,1.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,14.5,0.0,0.0,0.0,1.0
75%,3.0,37.0,1.0,0.0,31.275,1.0,0.0,0.0,1.0
max,3.0,80.0,8.0,9.0,512.3292,1.0,1.0,1.0,1.0


In [14]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn import impute

# Numeric columns
num_cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "sex_female",
]

imputer = impute.IterativeImputer()

imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed

imputed = imputer.fit_transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

#medians = X_train.median()
#X_train = X_train.fillna(medians)
#X_test = X_test.fillna(medians)


In [15]:
X_train.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
count,916.0,916.0,916.0,916.0,916.0,916.0,916.0,916.0,916.0
mean,2.305677,28.637394,0.539301,0.386463,33.66755,0.353712,0.204148,0.098253,0.696507
std,0.841811,12.832391,1.082188,0.893933,52.826831,0.478382,0.403298,0.297819,0.460018
min,1.0,0.1667,0.0,0.0,-4.461616,0.0,0.0,0.0,0.0
25%,2.0,21.321827,0.0,0.0,7.9177,0.0,0.0,0.0,0.0
50%,3.0,26.990999,0.0,0.0,14.5,0.0,0.0,0.0,1.0
75%,3.0,35.0,1.0,0.0,31.275,1.0,0.0,0.0,1.0
max,3.0,80.0,8.0,9.0,512.3292,1.0,1.0,1.0,1.0


## Standardize numeric columns

Standardize to mean 0 and standard deviation of 1

In [16]:
cols = "pclass,age,sibsp,fare".split(",")
sca = preprocessing.StandardScaler()
#pd.DataFrame(sca.fit_transform(X_train)).describe()

#X_train_tmp = sca.fit_transform(X_train.loc[:, cols])
#X_train.loc[:, cols] = pd.DataFrame(X_train_tmp, columns=cols)

#X_test_tmp = sca.fit_transform(X_test.loc[:, cols])
#X_test.loc[:, cols] = pd.DataFrame(X_test_tmp, columns=cols)


X_train_tmp = sca.fit_transform(X_train.loc[:,cols])
X_train_std = pd.DataFrame(X_train_tmp,columns=cols)
X_test_std = sca.transform(X_test.loc[:,cols])
X_test_std = pd.DataFrame(X_test_std,columns=cols)

X_train = X_train_std
X_test = X_test_std



## Base Model

In [17]:
from sklearn.dummy import DummyClassifier

bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)

0.539440203562341

In [18]:
from sklearn import metrics

metrics.precision_score(
    y_test, bm.predict(X_test)
)

0.4178082191780822

## Cross Validation - KMeans

In [19]:
X_train.isna().sum()

pclass    0
age       0
sibsp     0
fare      0
dtype: int64

In [20]:
# Concatinate since we're doing cross-validation

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])



from sklearn import model_selection
from sklearn .linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier,
]:
    cls = model()
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    s = model_selection.cross_val_score(cls, X, y, scoring="roc_auc", cv=kfold)
    print(
        f"{model.__name__:22} AUC: "
        f"{s.mean():.3f} STD: {s.std():2f}"
    )

DummyClassifier        AUC: 0.500 STD: 0.026944
LogisticRegression     AUC: 0.722 STD: 0.035968
DecisionTreeClassifier AUC: 0.627 STD: 0.041445
KNeighborsClassifier   AUC: 0.718 STD: 0.025983
GaussianNB             AUC: 0.691 STD: 0.048240
SVC                    AUC: 0.695 STD: 0.021138
RandomForestClassifier AUC: 0.718 STD: 0.029907
XGBClassifier          AUC: 0.764 STD: 0.030682
