In [1]:
import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [2]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

# Planning

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df.shape

(891, 14)

In [5]:
df["is_female"] = df.sex == "female"

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False


In [6]:
df = pd.concat([df, (pd.get_dummies(df[["class"]], drop_first=True))], axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female,class_Second,class_Third
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,0,1


In [7]:
df = pd.concat([df, (pd.get_dummies(df[["embark_town"]], drop_first=True))], axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,0,1,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,0,0,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,0,1,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,0,0,0,1
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,0,1,0,1


In [8]:
df = df.drop(columns=["Unnamed: 0", 'pclass', 'sex', 'embarked', 'class', 'deck', 'embark_town',])
df.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,22.0,1,0,7.25,0,False,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,True,0,0,0,0
2,2,1,26.0,0,0,7.925,1,True,0,1,0,1
3,3,1,35.0,1,0,53.1,0,True,0,0,0,1
4,4,0,35.0,0,0,8.05,1,False,0,1,0,1


In [9]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test

train, validate, test = split(df, stratify_by="survived")
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,True,1,0,0,1
306,306,1,,0,0,110.8833,1,True,0,0,0,0


In [11]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [12]:
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,True,1,0,0,1
306,306,1,,0,0,110.8833,1,True,0,0,0,0


In [13]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=3, random_state=319)

In [14]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').