In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from acquire import get_titanic_data

from prepare_cu import prep_titanic
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import export_graphviz
import graphviz

In [2]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [3]:
# acquire the data
df = get_titanic_data()

# prepare the data
train, validate, test = prep_titanic(df)

In [4]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [5]:
# drop out non-numerical columns or non-encoded version remaining in this data set
drops = ['sex', 'class','embarked', 'embark_town', 'passenger_id']

In [6]:
[dataset.drop(columns=drops, inplace=True) for dataset in [train, validate, test]]

[None, None, None]

In [7]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0
50,0,3,7.0,4,1,39.6875,0,0,1
218,1,1,32.0,0,0,76.2917,1,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0


In [8]:
# Decision Tree Classifier Model will predict survival

# Decision Tree Exercises
 - https://ds.codeup.com/classification/decision-trees/#installing-graphviz

# 1. What is your baseline prediction? What is your baseline accuracy?
- Baseline prediction is 62% accuracy when always predicting death

In [9]:
# obtain our mode
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [10]:
#baseline is death = 0 as there are 307 deaths (mode) and only 190 suvivors
train['baseline_death'] = 0

In [11]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S,baseline_death
583,0,1,36.0,0,0,40.125,1,0,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,0,1,0
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


In [12]:
print(f'Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is {(train.baseline_death == train.survived).mean():.3}')

Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is 0.618


In [13]:
baseline_class_report = classification_report(train.survived, train.baseline_death, zero_division=True)
print(baseline_class_report)

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       1.00      0.00      0.00       190

    accuracy                           0.62       497
   macro avg       0.81      0.50      0.38       497
weighted avg       0.76      0.62      0.47       497



# 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [14]:
#create the model
dtc = DecisionTreeClassifier()

In [64]:
# remove baseline assumption from the train
train.drop(columns='baseline_death', inplace=True)

KeyError: "['baseline_death'] not found in axis"

In [21]:
#split our X and y
X_train = train.drop(columns='survived')
y_train = train[['survived']]

In [23]:
# fit the model
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [24]:
# designate our X and y
X = X_train
y = y_train

In [28]:
accuracy = dtc.score(X, y)
y_pred = dtc.predict(X)
conf = confusion_matrix(y, y_pred)
#class report
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
#true positive rate
tpr = conf[1][1] / conf[1].sum()
#false positvive rate
fpr = conf[0][1] / conf[0].sum()
#true negative rate
tnr = conf[0][0] / conf[0].sum()
#false negative rate
fnr = conf[1][0] / conf[1].sum()

print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report



The accuracy for our model is 0.9738
The True Positive Rate is 0.968, The False Positive Rate is 0.0228,
The True Negative Rate is 0.977, and the False Negative Rate is 0.0316



Unnamed: 0,precision,recall,f1-score,support
0,0.980392,0.977199,0.978793,307.0
1,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


# 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [29]:
# use the model to predict
#dtc
y_pred = dtc.predict(X_train)


In [30]:
# check the values in the predictions
pd.Series(y_pred).value_counts()

0    306
1    191
dtype: int64

In [31]:
# model score: accuracy
accuracy = dtc.score(X_train, y_train)

In [32]:
accuracy

0.9738430583501007

In [33]:
# confusion matrix
conf = confusion_matrix(y_train, y_pred)

In [34]:
conf

array([[300,   7],
       [  6, 184]])

In [35]:
# get the classification report
class_report = classification_report(y_train, y_pred, output_dict=True)

In [36]:
class_report

{'0': {'precision': 0.9803921568627451,
  'recall': 0.9771986970684039,
  'f1-score': 0.9787928221859705,
  'support': 307},
 '1': {'precision': 0.9633507853403142,
  'recall': 0.968421052631579,
  'f1-score': 0.9658792650918635,
  'support': 190},
 'accuracy': 0.9738430583501007,
 'macro avg': {'precision': 0.9718714711015296,
  'recall': 0.9728098748499914,
  'f1-score': 0.972336043638917,
  'support': 497},
 'weighted avg': {'precision': 0.9738773468239887,
  'recall': 0.9738430583501007,
  'f1-score': 0.9738560498562314,
  'support': 497}}

In [37]:
pd.DataFrame(class_report).rename(columns={'0': 'deceased', '1': 'survived'}).T

Unnamed: 0,precision,recall,f1-score,support
deceased,0.980392,0.977199,0.978793,307.0
survived,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


# 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [38]:
conf

array([[300,   7],
       [  6, 184]])

In [39]:
# turn our confusio matrix into a dataframe for uman legibility:
conf_df = pd.DataFrame(conf, columns=['predict death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [40]:
conf_df

Unnamed: 0,predict death,predict_survive
actual_death,300,7
actual_survive,6,184


In [45]:
# make a key for reference
rubric_df = pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [46]:
rubric_df

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


In [47]:
joined = pd.concat([conf_df, rubric_df], axis=1)

In [48]:
rubric_df + ':'+ conf_df.values.astype(str)

Unnamed: 0,predict_death,predict_survive
actual_death,true negative:300,false positive:7
actual_survive,false negative:6,true positive:184


# 5. Run through steps 2-4 using a different max_depth value.


In [49]:
#dtc2
dtc2 = DecisionTreeClassifier(max_depth=3)

In [None]:
# fit the model

In [50]:
dtc2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [51]:
y_pred = dtc2.predict(X_train)

# 6. Which model performs better on your in-sample data?

In [53]:
# Model #1:
accuracy = dtc.score(X, y)
y_pred = dtc.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.9738
The True Positive Rate is 0.968, The False Positive Rate is 0.0228,
The True Negative Rate is 0.977, and the False Negative Rate is 0.0316



Unnamed: 0,precision,recall,f1-score,support
0,0.980392,0.977199,0.978793,307.0
1,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


In [54]:

# Model #2:
accuracy = dtc2.score(X, y)
y_pred = dtc2.predict(X)
conf = confusion_matrix(y, y_pred)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
conf = confusion_matrix(y, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7123
The True Positive Rate is 0.342, The False Positive Rate is 0.0586,
The True Negative Rate is 0.941, and the False Negative Rate is 0.658



Unnamed: 0,precision,recall,f1-score,support
0,0.698068,0.941368,0.801664,307.0
1,0.783133,0.342105,0.47619,190.0
accuracy,0.712274,0.712274,0.712274,0.712274
macro avg,0.7406,0.641737,0.638927,497.0
weighted avg,0.730587,0.712274,0.677238,497.0


# 7. Which model performs best on your out-of-sample data, the validate set?

In [55]:
# get predictions for our validation sets
y_val_pred_1 = dtc.predict(validate.drop(columns='survived'))
y_val_pred_2 = dtc2.predict(validate.drop(columns='survived'))

In [59]:
# get validation accuracy
accuracy_v_1 = dtc.score(validate.drop(columns='survived'), validate.survived)
accuracy_v_2 = dtc2.score(validate.drop(columns='survived'), validate.survived)

In [60]:
# model 1
accuracy_v_1

0.6869158878504673

In [62]:
# model 2
accuracy_v_2

0.7102803738317757

In [64]:

dot_data = export_graphviz(dtc2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

In [67]:
graph.render('titanic_model_2_tree', view=True)

'titanic_model_2_tree.pdf'

In [68]:

dot_data = export_graphviz(dtc, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

In [69]:
graph.render('titanic_model_1_tree', view=True)

'titanic_model_1_tree.pdf'

# Random Forrest Exercises
    - https://ds.codeup.com/classification/random-forests/

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from acquire_cu import get_titanic_data
from prepare_cu import prep_titanic
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [4]:
#acquire the data
df = get_titanic_data()
#prepare the data
train, validate, test = prep_titanic(df)

In [5]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [6]:
# Drop any non-numerical columns in the data set
drops = ['sex', 'class', 'embarked', 'embark_town', 'passenger_id']

In [7]:
for dataset in [train, validate, test]: 
    dataset.drop(columns=drops, inplace=True)

In [8]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0
50,0,3,7.0,4,1,39.6875,0,0,1
218,1,1,32.0,0,0,76.2917,1,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0


In [39]:
#create baseline is at 62 %
baseline = (y_train.value_counts().idxmax() == y_train).mean()
baseline

0.6177062374245473

In [40]:
# Create Random Forrest Model

clf = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10, random_state = 1349)

In [41]:
#fit the model after splitting our X and y
X_train, y_train = train.drop(columns='survived'), train.survived

In [42]:
# fit the thing
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=1349)

In [65]:
# use the thing
y_pred = clf.predict(X_train)

# 2. Evaluate your results using the model score, confusion matrix, and classification report.

# 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [66]:
clf_score = clf.score(X_train, y_train)
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))


    The accuracy for our model is 0.9276
    The True Positive Rate is 0.868, The False Positive Rate is 0.0358,
    The True Negative Rate is 0.964, and the False Negative Rate is 0.132
    


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.922118,0.9375,0.927565,0.929809,0.927999
recall,0.964169,0.868421,0.927565,0.916295,0.927565
f1-score,0.942675,0.901639,0.927565,0.922157,0.926987
support,307.0,190.0,0.927565,497.0,497.0


# 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [67]:
# Create Random Forest Model

clf1 = RandomForestClassifier(min_samples_leaf=3, max_depth=3, random_state=1349)

In [68]:
# Fit the model
clf1.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=1349)

In [69]:
y_pred1 = clf1.predict(X_train)
clf_score = clf1.score(X_train, y_train)
conf = confusion_matrix(y_train, y_pred1)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))


    The accuracy for our model is 0.7525
    The True Positive Rate is 0.537, The False Positive Rate is 0.114,
    The True Negative Rate is 0.886, and the False Negative Rate is 0.463
    


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.922118,0.9375,0.927565,0.929809,0.927999
recall,0.964169,0.868421,0.927565,0.916295,0.927565
f1-score,0.942675,0.901639,0.927565,0.922157,0.926987
support,307.0,190.0,0.927565,497.0,497.0


In [70]:
X_val, y_val = validate.drop(columns='survived'), validate.survived

In [71]:
print('Model #1: min samples 1, max depth 10: ON VALIDATE SET')
clf_score = clf.score(X_val, y_val)
y_pred_val = clf.predict(X_val)
conf = confusion_matrix(y_val, y_pred_val)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
print('-------------------------------------------\n Model #2: min samples 3, max_depth 3 : ON VALIDATE SET\n')
clf_score = clf1.score(X_val, y_val)
y_pred_val1 = clf1.predict(X_val)
conf = confusion_matrix(y_val, y_pred_val1)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')

Model #1: min samples 1, max depth 10: ON VALIDATE SET

    The accuracy for our model is 0.7617
    The True Positive Rate is 0.646, The False Positive Rate is 0.167,
    The True Negative Rate is 0.833, and the False Negative Rate is 0.354
    
-------------------------------------------
 Model #2: min samples 3, max_depth 3 : ON VALIDATE SET


    The accuracy for our model is 0.743
    The True Positive Rate is 0.5, The False Positive Rate is 0.106,
    The True Negative Rate is 0.894, and the False Negative Rate is 0.5
    


# 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [72]:
conf = confusion_matrix(y_train, y_pred)

In [73]:
conf

array([[296,  11],
       [ 25, 165]])

In [74]:
# key for reference
rubric_df = pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [75]:
rubric_df

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


# 6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [76]:
clf.predict_proba(X_train)

array([[0.70104785, 0.29895215],
       [0.06711869, 0.93288131],
       [0.9758    , 0.0242    ],
       [0.10064358, 0.89935642],
       [0.0443181 , 0.9556819 ],
       [0.68404055, 0.31595945],
       [0.47006918, 0.52993082],
       [0.67405366, 0.32594634],
       [0.45560234, 0.54439766],
       [1.        , 0.        ],
       [0.82134381, 0.17865619],
       [0.89806093, 0.10193907],
       [0.13108025, 0.86891975],
       [0.59453012, 0.40546988],
       [0.60759589, 0.39240411],
       [0.38816308, 0.61183692],
       [0.93794664, 0.06205336],
       [0.18171429, 0.81828571],
       [0.5230589 , 0.4769411 ],
       [0.78048702, 0.21951298],
       [0.13383776, 0.86616224],
       [0.82246706, 0.17753294],
       [0.2079232 , 0.7920768 ],
       [0.19833333, 0.80166667],
       [0.38900423, 0.61099577],
       [0.93237155, 0.06762845],
       [0.12313969, 0.87686031],
       [0.77758204, 0.22241796],
       [0.58227628, 0.41772372],
       [0.67053517, 0.32946483],
       [0.