In [21]:
import graphviz
from graphviz import Graph

In [1]:
import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Load the Titanic passenger data. The CSV's unnamed leading column comes back
# as an ordinary "Unnamed: 0" column, which is dropped further down.
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [2]:
def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """
    Crude train, validate, test split.

    The test split is carved off first, then the remainder is divided into
    train and validate.

    Parameters
    ----------
    df : DataFrame to split.
    stratify_by : column name (or array-like of column names) to stratify on.
        None (the default) means no stratification.
    test_size : share of `df` held out as the test split.
    validate_size : share of the remaining rows held out as the validate split.
    random_state : seed passed to both train_test_split calls.

    Returns
    -------
    (train, validate, test)
    """
    # `is None` rather than `== None`: an array-like of column names compared
    # with == yields an element-wise result whose truth value is ambiguous.
    strata = None if stratify_by is None else df[stratify_by]
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=strata
    )

    # Re-derive the strata from the reduced frame so they line up with its rows.
    strata = None if stratify_by is None else train[stratify_by]
    train, validate = train_test_split(
        train, test_size=validate_size, random_state=random_state, stratify=strata
    )

    return train, validate, test

## 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [27]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [28]:
# NOTE: this cell uses `train`, which is created by the split cell further down;
# run that cell first.
# The baseline always predicts the most common class in the training data
# (0, not survived, is the majority).
baseline_prediction = train.survived.mode()[0]

# Compare against the constant instead of writing a new column into `train`,
# so the frame the model features are built from stays unchanged.
baseline_accuracy = (train.survived == baseline_prediction).mean()
baseline_accuracy

0.6164658634538153

# Planning

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df.shape

(891, 14)

In [5]:
df.dtypes

Unnamed: 0        int64
passenger_id      int64
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class            object
deck             object
embark_town      object
alone             int64
dtype: object

In [6]:
# Boolean flag: True where the passenger's sex is "female", False otherwise.
is_female_mask = df["sex"] == "female"
df["is_female"] = is_female_mask
del is_female_mask

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False


In [7]:
# One-hot encode `class` and append the new columns to df.
# drop_first=True leaves out the column for the first category.
df = pd.concat(
    [
        df,
        pd.get_dummies(df[["class"]], drop_first=True),
    ],
    axis=1,
)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female,class_Second,class_Third
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,0,1


In [8]:
# One-hot encode `embark_town` and append the new columns to df.
# drop_first=True leaves out the column for the first category.
# NOTE(review): a row with a missing embark_town would get 0 in every dummy
# column, the same as the dropped category. Confirm whether the data has any.
df = pd.concat(
    [
        df,
        pd.get_dummies(df[["embark_town"]], drop_first=True),
    ],
    axis=1,
)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,0,1,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,0,0,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,0,1,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,0,0,0,1
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,0,1,0,1


In [9]:
# Drop the columns that will not be passed to the model.
# passenger_id is kept, so it ends up among the model's features.
# NOTE(review): by its name it looks like a row identifier. Consider dropping it too.
df = df.drop(
    columns=[
        "Unnamed: 0",
        'pclass',
        'sex',
        'embarked',
        'class',
        'deck',
        'embark_town',
    ]
)
df.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,22.0,1,0,7.25,0,False,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,True,0,0,0,0
2,2,1,26.0,0,0,7.925,1,True,0,1,0,1
3,3,1,35.0,1,0,53.1,0,True,0,0,0,1
4,4,0,35.0,0,0,8.05,1,False,0,1,0,1


In [10]:
# Split the data into train / validate / test using the split() helper defined above.
# Stratifying on `survived` keeps the share of each class about the same in all three splits.

train, validate, test = split(df, stratify_by="survived")
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,True,1,0,0,1
306,306,1,,0,0,110.8833,1,True,0,0,0,0


In [11]:
train.isna().sum()

passenger_id                0
survived                    0
age                        97
sibsp                       0
parch                       0
fare                        0
alone                       0
is_female                   0
class_Second                0
class_Third                 0
embark_town_Queenstown      0
embark_town_Southampton     0
dtype: int64

In [12]:
# Calculate the fill value from the train split only, so validate and test
# do not influence preprocessing. int() truncates the mean to a whole number.
avg_age = int(train.age.mean())

# Fill missing ages in all three splits with that single train-derived value.
# .assign() builds a new DataFrame instead of writing into the objects returned
# by train_test_split, which is what raised pandas' SettingWithCopyWarning.
train = train.assign(age=train.age.fillna(avg_age))

validate = validate.assign(age=validate.age.fillna(avg_age))

test = test.assign(age=test.age.fillna(avg_age))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
# For each split: X is every column except `survived`, y is the `survived` label
# (having labels is what makes this a supervised problem).
X_train, y_train = train.drop(columns=['survived']), train['survived']

X_validate, y_validate = validate.drop(columns=['survived']), validate['survived']

X_test, y_test = test.drop(columns=['survived']), test['survived']

In [14]:
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,True,1,0,0,1
306,306,1,29.0,0,0,110.8833,1,True,0,0,0,0


In [15]:
train.shape

(498, 12)

In [16]:
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,True,1,0,0,1
306,306,1,29.0,0,0,110.8833,1,True,0,0,0,0


In [17]:
train.dtypes

passenger_id                 int64
survived                     int64
age                        float64
sibsp                        int64
parch                        int64
fare                       float64
alone                        int64
is_female                     bool
class_Second                 uint8
class_Third                  uint8
embark_town_Queenstown       uint8
embark_town_Southampton      uint8
dtype: object

## 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [70]:
# Create a new, unfitted decision tree model.
# max_depth=3 limits how deep the tree may grow; random_state fixes the seed
# so repeated runs build the same tree.

clf = DecisionTreeClassifier(max_depth=3, random_state=319)

In [71]:
# Train (fit) the model on the training data.
# fit() hands back the estimator itself, so ending the cell with the call
# displays the fitted model.
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=319)

In [72]:
# Visualize the fitted tree. out_file=None makes export_graphviz return the
# DOT source as a string instead of writing it to a file.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)

# Write the PDF and open it in a viewer (view=True).
# NOTE(review): every tree in this notebook is rendered to this same file name,
# so each render replaces the previous PDF.
graph.render('titanic_decision_tree', format="pdf", view=True)

'titanic_decision_tree.pdf'

In [73]:
# Predict a class for every row of the training features, then show the first three.
y_pred = clf.predict(X_train)
y_pred[:3]

array([0, 0, 0])

In [74]:
# Class probability estimates for every training row (one column per class),
# then show the first three rows.
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:3]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286]])

In [75]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

## 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [29]:
confusion_matrix(y_train, y_pred)

array([[277,  30],
       [ 57, 134]])

In [30]:
# Confusion matrix as a labelled table: rows are the actual class,
# columns are the predicted class.
labels = sorted(y_train.unique())

pd.DataFrame(
    data=confusion_matrix(y_train, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,277,30
1,57,134


## 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [32]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.82      0.70      0.75       191

    accuracy                           0.83       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.83      0.82       498



## 5. Run through steps 2-4 using a different max_depth value.

### first we'll try max depth 2

In [54]:
# New, unfitted tree limited to depth 2. This rebinds `clf`, replacing the depth-3 model.
clf = DecisionTreeClassifier(max_depth=2, random_state=319)

In [55]:
# Fit the depth-2 tree on the training data; fit() hands back the estimator,
# so the call itself displays the fitted model.
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2, random_state=319)

In [38]:
# Export the depth-2 tree as DOT source (returned as a string because out_file=None).
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)

# Render to PDF and open it in a viewer.
# NOTE(review): same file name as the other tree renders in this notebook,
# so the earlier PDF is replaced.
graph.render('titanic_decision_tree', format="pdf", view=True)

'titanic_decision_tree.pdf'

In [56]:
# In-sample predictions from the depth-2 tree; show the first three.
y_pred = clf.predict(X_train)
y_pred[:3]

array([0, 0, 0])

In [57]:
# In-sample class probabilities from the depth-2 tree; show the first three rows.
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:3]

array([[0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.68644068, 0.31355932]])

In [58]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

In [59]:
confusion_matrix(y_train, y_pred)

array([[265,  42],
       [ 58, 133]])

In [60]:
# Labelled confusion matrix for the depth-2 tree on train
# (rows: actual class, columns: predicted class).
labels = sorted(y_train.unique())

pd.DataFrame(
    data=confusion_matrix(y_train, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,265,42
1,58,133


In [61]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



### now we'll try max depth of 4

In [81]:
# New, unfitted tree limited to depth 4. This rebinds `clf`, replacing the depth-2 model.
clf = DecisionTreeClassifier(max_depth=4, random_state=319)

In [82]:
# Fit the depth-4 tree on the training data; fit() hands back the estimator,
# so the call itself displays the fitted model.
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, random_state=319)

In [47]:
# Export the depth-4 tree as DOT source (returned as a string because out_file=None).
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)

# Render to PDF and open it in a viewer.
# NOTE(review): same file name as the other tree renders in this notebook,
# so the earlier PDF is replaced.
graph.render('titanic_decision_tree', format="pdf", view=True)

'titanic_decision_tree.pdf'

In [83]:
# In-sample predictions from the depth-4 tree; show the first three.
y_pred = clf.predict(X_train)
y_pred[:3]

array([0, 0, 0])

In [84]:
# In-sample class probabilities from the depth-4 tree; show the first three rows.
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:3]

array([[0.6626506, 0.3373494],
       [0.6626506, 0.3373494],
       [1.       , 0.       ]])

In [85]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

In [86]:
confusion_matrix(y_train, y_pred)

array([[279,  28],
       [ 48, 143]])

In [87]:
# Labelled confusion matrix for the depth-4 tree on train
# (rows: actual class, columns: predicted class).
labels = sorted(y_train.unique())

pd.DataFrame(
    data=confusion_matrix(y_train, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,279,28
1,48,143


In [88]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       307
           1       0.84      0.75      0.79       191

    accuracy                           0.85       498
   macro avg       0.84      0.83      0.84       498
weighted avg       0.85      0.85      0.85       498



## 6. Which model performs better on your in-sample data?

### The model with max depth of 4 performs best in-sample (0.85 training accuracy, versus 0.83 for max depth 3 and 0.80 for max depth 2)

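## 7. Which model performs best on your out-of-sample data, the validate set?

### The model with max depth of 3 performs best on validate (0.80 accuracy, versus 0.79 for max depth 4 and 0.76 for max depth 2); the per-depth results follow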

### Validate results: max depth of 2

In [64]:
# Fit a depth-2 tree in this cell so the section scores the model its heading names.
# `clf` holds whichever tree was fit last, which is the depth-4 tree when the
# notebook runs top to bottom.
clf_depth_2 = DecisionTreeClassifier(max_depth=2, random_state=319).fit(X_train, y_train)

# Out-of-sample predictions on the validate split; show the first three.
y_pred = clf_depth_2.predict(X_validate)
y_pred[0:3]

array([1, 0, 0])

In [65]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [66]:
# Validate-set class probabilities from a depth-2 tree fit in this cell, so the
# result does not depend on which model `clf` currently holds (the depth-4 tree
# when the notebook runs top to bottom).
y_pred_proba = (
    DecisionTreeClassifier(max_depth=2, random_state=319)
    .fit(X_train, y_train)
    .predict_proba(X_validate)
)
y_pred_proba[0:3]

array([[0.4691358 , 0.5308642 ],
       [0.68644068, 0.31355932],
       [0.89756098, 0.10243902]])

In [68]:
# Labelled confusion matrix on the validate split
# (rows: actual class, columns: predicted class).
labels = sorted(y_validate.unique())

pd.DataFrame(
    data=confusion_matrix(y_validate, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,109,23
1,28,54


In [69]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



### max depth 3

In [76]:
# Fit a depth-3 tree in this cell so the section scores the model its heading names.
# `clf` holds whichever tree was fit last, which is the depth-4 tree when the
# notebook runs top to bottom.
clf_depth_3 = DecisionTreeClassifier(max_depth=3, random_state=319).fit(X_train, y_train)

# Out-of-sample predictions on the validate split; show the first three.
y_pred = clf_depth_3.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [77]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [78]:
# Validate-set class probabilities from a depth-3 tree fit in this cell, so the
# result does not depend on which model `clf` currently holds (the depth-4 tree
# when the notebook runs top to bottom).
y_pred_proba = (
    DecisionTreeClassifier(max_depth=3, random_state=319)
    .fit(X_train, y_train)
    .predict_proba(X_validate)
)
y_pred_proba[0:3]

array([[0.91666667, 0.08333333],
       [0.62222222, 0.37777778],
       [0.90640394, 0.09359606]])

In [79]:
# Labelled confusion matrix on the validate split
# (rows: actual class, columns: predicted class).
labels = sorted(y_validate.unique())

pd.DataFrame(
    data=confusion_matrix(y_validate, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,116,16
1,27,55


In [80]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       132
           1       0.77      0.67      0.72        82

    accuracy                           0.80       214
   macro avg       0.79      0.77      0.78       214
weighted avg       0.80      0.80      0.80       214



### max depth 4

In [89]:
# Out-of-sample predictions on the validate split; show the first three.
# Run top to bottom, `clf` is the depth-4 tree fit in the in-sample section above.
y_pred = clf.predict(X_validate)
y_pred[:3]

array([0, 0, 0])

In [90]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [91]:
# Validate-set class probabilities; show the first three rows.
# Run top to bottom, `clf` is the depth-4 tree fit in the in-sample section above.
y_pred_proba = clf.predict_proba(X_validate)
y_pred_proba[:3]

array([[1.        , 0.        ],
       [0.6626506 , 0.3373494 ],
       [0.91919192, 0.08080808]])

In [92]:
# Labelled confusion matrix on the validate split
# (rows: actual class, columns: predicted class).
labels = sorted(y_validate.unique())

pd.DataFrame(
    data=confusion_matrix(y_validate, y_pred),
    index=labels,
    columns=labels,
)

Unnamed: 0,0,1
0,112,20
1,25,57


In [93]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       132
           1       0.74      0.70      0.72        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214

