In [159]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import acquire
import prepare

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

## Using titanic data

In [160]:
# pulling in the raw titanic data with the project-local acquire helper
# and previewing the first five rows to confirm the load worked
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [161]:
# splitting titanic data into train, validate, test with the project-local prepare helper
# NOTE: the helper returns the three frames in (train, validate, test) order, as unpacked here
train, validate, test = prepare.prep_titanic_data(df)
# checking the size of the training split
train.shape

(498, 14)

In [107]:
# checking the size of the validate split
validate.shape

(214, 14)

In [108]:
# checking the size of the test split
test.shape

(179, 14)

In [109]:
# previewing the prepared training data (dummy columns for sex and embark_town were added by prepare)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1
306,1,1,female,,0,0,110.8833,Cherbourg,1,1,0,1,0,0


In [68]:
# checking dtypes and non-null counts for every column before modeling
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   pclass                   498 non-null    int64  
 2   sex                      498 non-null    object 
 3   age                      401 non-null    float64
 4   sibsp                    498 non-null    int64  
 5   parch                    498 non-null    int64  
 6   fare                     498 non-null    float64
 7   embark_town              498 non-null    object 
 8   alone                    498 non-null    int64  
 9   sex_female               498 non-null    uint8  
 10  sex_male                 498 non-null    uint8  
 11  embark_town_Cherbourg    498 non-null    uint8  
 12  embark_town_Queenstown   498 non-null    uint8  
 13  embark_town_Southampton  498 non-null    uint8  
dtypes: float64(2), int64(5), object(2), uint8(5)

## Exercise 1

What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [110]:
# counting each class of the target to find the most prevalent one (the baseline prediction)
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

**Baseline is 0 (did not survive) since that is the most prevalent value**

In [162]:
# creating a baseline column to compare to actual
# the baseline prediction is the most prevalent class in train; taking it from the data
# (instead of hard-coding 0) keeps this cell correct if the split ever changes
train['baseline'] = train.survived.mode()[0]
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1,0
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1,0
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1,0
306,1,1,female,,0,0,110.8833,Cherbourg,1,1,0,1,0,0,0


In [163]:
# baseline accuracy = share of training rows where the constant baseline guess equals the actual outcome
train.baseline.eq(train.survived).mean()

0.6164658634538153

**Baseline accuracy is 62% so, to add value, a model needs to have greater accuracy**

## Exercise 2 

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [164]:
# building x (features only) and y (target only) for each split
# columns kept out of the feature matrices:
#   survived            - the target itself
#   sex, embark_town    - raw string columns, already represented by their dummy columns
#   sex_female          - redundant with sex_male
#   age                 - contains missing values in train
non_feature_cols = ['survived', 'sex', 'embark_town', 'sex_female', 'age']

# train additionally carries the helper 'baseline' column added earlier
x_train = train.drop(columns=non_feature_cols + ['baseline'])
y_train = train.survived

x_validate = validate.drop(columns=non_feature_cols)
y_validate = validate.survived

x_test = test.drop(columns=non_feature_cols)
y_test = test.survived

In [145]:
# creating the Decision Tree object with desired hyper-parameters
# max_depth=3 limits the tree to three levels of splits; random_state is fixed so reruns give the same tree
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [146]:
# fitting the algorithm to the training data only (validate and test stay unseen)
# fit returns the estimator itself, so clf stays bound to the same, now fitted, object
clf = clf.fit(x_train, y_train)

In [139]:
# creating visualization of tree
# out_file=None makes export_graphviz return the DOT source as a string instead of writing a file
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

# writes titanic_decision_tree.pdf next to the notebook; view=True also opens it in the default viewer
graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [147]:
# making predictions on the training observations (in-sample) and previewing the first five
y_pred = clf.predict(x_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [148]:
# estimating the probability of each class for every training observation
# columns follow clf.classes_, i.e. [P(0 = did not survive), P(1 = survived)]
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.07142857, 0.92857143],
       [0.01923077, 0.98076923]])

## Exercise 3. 

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [149]:
# in-sample accuracy of the fitted tree, rounded to two decimals
train_accuracy = clf.score(x_train, y_train)
print(f'Accuracy of Decision Tree classifier on training set: {train_accuracy:.2f}')

Accuracy of Decision Tree classifier on training set: 0.82


In [120]:
# creating confusion matrix: rows are actual classes, columns are predicted classes
confusion_matrix(y_train, y_pred)

array([[276,  31],
       [ 57, 134]])

In [121]:
# labelling the confusion matrix with the class values so rows (actual) and columns (predicted) are readable
labels = sorted(y_train.unique())
cm_counts = confusion_matrix(y_train, y_pred)

pd.DataFrame(cm_counts, index=labels, columns=labels)

Unnamed: 0,0,1
0,276,31
1,57,134


In [122]:
# creating classification report (per-class precision, recall, f1-score and support, plus overall accuracy)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



## Exercise 4 

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [123]:
# row-normalized confusion matrix (rows = actual, columns = predicted), so each cell is a rate, not a count
# treating 0 (did not survive) as the positive class:
#   upper left = true positive rate,  upper right = false negative rate
#   lower left = false positive rate, lower right = true negative rate
pd.DataFrame(confusion_matrix(y_train, y_pred, normalize='true'), index=labels, columns=labels)

Unnamed: 0,0,1
0,0.899023,0.100977
1,0.298429,0.701571


In [124]:
# classification report with accuracy, precision, recall, f1-score, & support
# (same report as in Exercise 3, repeated here to cover the remaining Exercise 4 metrics)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



## Exercise 5, Step II

Run through steps 2-4 using a different max_depth value.

In [152]:
# creating the Decision Tree object with desired hyper-parameters
# same setup as before but with max_depth raised from 3 to 5; this rebinds clf, replacing the depth-3 model
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

In [153]:
# fitting the algorithm to the training data only (validate and test stay unseen)
# fit returns the estimator itself, so clf stays bound to the same, now fitted, object
clf = clf.fit(x_train, y_train)

In [127]:
# creating visualization of the max_depth=5 tree
# out_file=None makes export_graphviz return the DOT source as a string instead of writing a file
dot_data = export_graphviz(clf, feature_names=x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

# rendering to its own file name so this tree does not overwrite the PDF of the max_depth=3 tree
graph.render('titanic_decision_tree_depth_5', view=True)

'titanic_decision_tree_depth_5.pdf'

In [154]:
# making predictions on the training observations (in-sample) with the depth-5 tree
# this overwrites the y_pred produced by the depth-3 tree
y_pred = clf.predict(x_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [155]:
# estimating the probability of each class for every training observation
# columns follow clf.classes_, i.e. [P(0 = did not survive), P(1 = survived)]
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.62025316, 0.37974684],
       [0.82608696, 0.17391304],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

## Exercise 5, Step III

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [156]:
# in-sample accuracy of the fitted tree, rounded to two decimals
train_accuracy = clf.score(x_train, y_train)
print(f'Accuracy of Decision Tree classifier on training set: {train_accuracy:.2f}')

Accuracy of Decision Tree classifier on training set: 0.84


In [131]:
# creating confusion matrix: rows are actual classes, columns are predicted classes
confusion_matrix(y_train, y_pred)

array([[303,   4],
       [ 77, 114]])

In [132]:
# labelling the confusion matrix with the class values so rows (actual) and columns (predicted) are readable
labels = sorted(y_train.unique())
cm_counts = confusion_matrix(y_train, y_pred)

pd.DataFrame(cm_counts, index=labels, columns=labels)

Unnamed: 0,0,1
0,303,4
1,77,114


In [133]:
# creating classification report (per-class precision, recall, f1-score and support, plus overall accuracy)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



## Exercise 5, Step IV 

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [134]:
# row-normalized confusion matrix (rows = actual, columns = predicted), so each cell is a rate, not a count
# treating 0 (did not survive) as the positive class:
#   upper left = true positive rate,  upper right = false negative rate
#   lower left = false positive rate, lower right = true negative rate
pd.DataFrame(confusion_matrix(y_train, y_pred, normalize='true'), index=labels, columns=labels)

Unnamed: 0,0,1
0,0.986971,0.013029
1,0.403141,0.596859


In [135]:
# classification report with accuracy, precision, recall, f1-score, & support
# (same report as in Step III, repeated here to cover the remaining Step IV metrics)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



## Exercise 6

Which model performs better on your in-sample data?

**The model with max_depth of 5 performed slightly better on the in-sample data (0.84 accuracy) than the model with max_depth of 3 (0.82 accuracy)**

## Exercise 7

Which model performs best on your out-of-sample data, the validate set?

In [157]:
# running this after creating and running Decision Tree classifier with desired hyper-parameters and fitting to the data
# first with max_depth set to 3, then with max_depth set to 5
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.75


**Model with max_depth set to 3 performs better on the validate set with accuracy of 0.79**

**Model with max_depth set to 5 has accuracy of 0.75**

# Random Forest Exercises

### Exercise 1

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [167]:
# building the random forest with the hyper-parameters requested by the exercise
# min_samples_leaf=1 is passed explicitly even though the estimator's repr below omits it
rf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=369)
rf

RandomForestClassifier(max_depth=10, random_state=369)

In [169]:
# fitting model to train set only (validate and test stay unseen)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=10, random_state=369)

In [170]:
# evaluating importance of each feature, higher score = more importance
# values are in the same order as the columns of x_train (shown in the next cell)
print(rf.feature_importances_)

[0.10270126 0.06734553 0.04841256 0.36655503 0.02072547 0.344281
 0.01592391 0.01512833 0.0189269 ]


In [172]:
# showing the feature columns so the importance scores above can be matched to names by position
x_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,1,0,0,40.125,1,1,1,0,0
165,3,0,2,20.525,0,1,0,0,1
50,3,4,1,39.6875,0,1,0,0,1
259,2,0,1,26.0,0,0,0,0,1
306,1,0,0,110.8833,1,0,1,0,0


In [173]:
# classifying each observation as survived (1) or not survived (0)
y_pred = rf.predict(x_train)
# previewing only the first five predictions (as in the decision tree cells) instead of dumping all of them
y_pred[0:5]

array([0, 1, 0, 1, 1])

In [175]:
# estimating the probability of those classifications
# columns follow rf.classes_, i.e. [P(0 = did not survive), P(1 = survived)]
y_pred_proba = rf.predict_proba(x_train)
# previewing only the first five rows (as in the decision tree cells) instead of dumping the whole array
y_pred_proba[0:5]

array([[0.93859479, 0.06140521],
       [0.31441667, 0.68558333],
       [0.98071429, 0.01928571],
       [0.02132669, 0.97867331],
       [0.        , 1.        ]])

## Exercise 2

Evaluate your results using the model score, confusion matrix, and classification report.

In [177]:
# in-sample accuracy of the fitted random forest, rounded to two decimals
train_accuracy = rf.score(x_train, y_train)
print(f'Accuracy of random forest classifier on training set: {train_accuracy:.2f}')

Accuracy of random forest classifier on training set: 0.94


In [178]:
# creating confusion matrix: rows are actual classes, columns are predicted classes
print(confusion_matrix(y_train, y_pred))

[[302   5]
 [ 23 168]]


In [180]:
# creating classification report (per-class precision, recall, f1-score and support, plus overall accuracy)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       307
           1       0.97      0.88      0.92       191

    accuracy                           0.94       498
   macro avg       0.95      0.93      0.94       498
weighted avg       0.95      0.94      0.94       498



## Exercise 3

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

**In this case, positive = 0 (did not survive), negative = 1 (survived)**

In [192]:
# reading the four outcome counts straight from the confusion matrix instead of typing them in by hand,
# so the metrics below cannot drift out of sync with the model's predictions
# positive = 0 (did not survive), negative = 1 (survived); with rows = actual and columns = predicted
# in [0, 1] order, the flattened matrix reads: tp, fn, fp, tn
tp, fn, fp, tn = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()
all_4 = tp + tn + fp + fn

# share of all predictions that were correct
accuracy = (tp + tn) / all_4
# rates are taken over the actual class: positives (tp + fn) or negatives (fp + tn)
true_positive_rate = tp / (tp + fn)
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn /(fn + tp)
# precision is taken over everything predicted positive; recall is the same quantity as the true positive rate
precision = tp / (tp + fp)
recall = tp / (tp + fn)
# harmonic mean of precision and recall
f1_score = 2 * ((precision * recall) / (precision + recall))
# number of actual observations in each class, in (class 0, class 1) order
support = (y_train == 0).sum(), (y_train == 1).sum()

In [198]:
# printing every metric computed above as "name: value", one per line, in the order the exercise lists them
metric_rows = [
    ('accuracy', accuracy),
    ('true_positive_rate', true_positive_rate),
    ('false_positive_rate', false_positive_rate),
    ('true_negative_rate', true_negative_rate),
    ('false_negative_rate', false_negative_rate),
    ('precision', precision),
    ('recall', recall),
    ('f1_score', f1_score),
    ('support', support),
]
for metric_name, metric_value in metric_rows:
    print(f'{metric_name}: {metric_value}')

accuracy: 0.9437751004016064
true_positive_rate: 0.9837133550488599
false_positive_rate: 0.12041884816753927
true_negative_rate: 0.8795811518324608
false_negative_rate: 0.016286644951140065
precision: 0.9292307692307692
recall: 0.9837133550488599
f1_score: 0.9556962025316454
support: (307, 191)


## Exercise 4

Run through steps increasing your min_samples_leaf and decreasing your max_depth.