In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve
)

url = ("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls")
df = pd.read_excel(url)
orig_df = df

In [2]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [3]:
import pandas_profiling
pandas_profiling.ProfileReport(df)



In [4]:
# Manually calculate pearson
numerator = np.cov(df[["survived", "pclass"]].T)[0][1]

stddevs = df[["survived", "pclass"]].std()
denominator = stddevs[0] * stddevs[1]

print(numerator/denominator)

-0.3124693626496759


In [5]:
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [6]:
# Rows with nulls
mask = df.isnull().any(axis=1)
#mask.head()

df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [7]:
df.sex.value_counts(dropna=False)
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [8]:
# Drop unhelpful or leaky variables

df = df.drop(columns=[
    "name",
    "ticket",
    "home.dest", 
    "boat",
    "body",
    "cabin",
])

df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [9]:
# Create dummy variables for sex and embarked

df = pd.get_dummies(df)
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [10]:
# Remove multicorrelated
df = df.drop(columns="sex_male")

In [11]:
# Ready to go, set x and y
y = df.survived
X = df.drop(columns="survived")


## Split

In [12]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)