## Lots of Imports for tools that would normally be used in a project

In [63]:
# My created .py files for modularization
import env
import os
import acquire
import prepare
# Array and Dataframes
import numpy as np
import pandas as pd
# Load datasets
from pydataset import data
# Evaluation: Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Evaluation: Statistical Analysis
from scipy import stats
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,\
precision_score,\
recall_score,\
classification_report,\
confusion_matrix
from sklearn.ensemble import\
RandomForestClassifier as rf

## Questions
#### Create a new notebook, random_forests, and work with titanic data to do the following:

> Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

Evaluate your results using the model score, confusion matrix, and classification report. 
### A: [Model Score](#model-score), [Confusion Matrix](#confusion-matrix), [Classification Report](#classification-report)

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
### A: [Accuracy, True Positive rate, False Positive rate, True Negative rate, False Negative rate, Precision, Recall, F1-Score, and Support](#a-accuracy-true-positive-rate-false-positive-rate-true-negative-rate-false-negative-rate-precision-recall-f1-score-and-support)

Run through steps increasing your min_samples_leaf and decreasing your max_depth.
### A: [Increase Min Leaf, Decrease max Depth](#increase-min-leaf-decrease-max-depth)

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
### A: [Comparing Model 1 Train and Validate Reports](#comparing-model-1-train-and-validate-reports)</br>The validation set does significantly worst than the training set in predicting those that will survive. - there is a pretty consistent base of metrics that will fluctuate. 
After making a few models, which one has the best performance (or closest metrics) on both train and validate?
### A: [Min Samples Leaves 1-5 , 1-20 Mad Depth](#min-samples-leaf-1-5)</br>With a min sample leaf size of 3 there is a consistent accuracy score with 1-20 max depth of about 90%. Although suboptimal it is very close

In [64]:
# acquiring and preparing data
# prep_titanic has acquire built in
titanic = prepare.prep_titanic()

## Splitting Data

In [65]:
train, validate, test  = prepare.splitter(titanic,target='survived')

In [66]:
# determining baseline
train['survived'].value_counts()

survived
0    307
1    191
Name: count, dtype: int64

## Determining Baseline

In [67]:
# finding baseline accuracy to compare to predictions
train['label'] = ['survived'] == '0'
baseline_accuracy = (train.survived == 0).mean()
baseline_accuracy

0.6164658634538153

## Model 1 Defined

In [68]:
# model 1
x_train = train.drop(columns=['survived','label','sex','embarked'])
y_train = train[['survived']]
y_train = y_train.values.ravel()

In [69]:
# setting parameters for decision tree
rf1 = rf(max_depth=10,min_samples_leaf=1,random_state=4343)

## Fit and Train Model

In [70]:
rf1.fit(x_train,y_train)

In [71]:
model1_predictions = rf1.predict(x_train)

## Model Score

In [72]:
rf1.score(x_train,y_train)

0.9718875502008032

## Confusion Matrix

In [73]:
pd.DataFrame(
    confusion_matrix(y_train,model1_predictions),
    columns = ['pred survived','pred opposite'], index=['actual opposite','actual survived']
)

Unnamed: 0,pred survived,pred opposite
actual opposite,307,0
actual survived,14,177


## Classification Report

In [95]:
model1_test_report = pd.DataFrame(classification_report(y_train,model1_predictions,output_dict=True)).T
model1_test_report

Unnamed: 0,precision,recall,f1-score,support
0,0.956386,1.0,0.977707,307.0
1,1.0,0.926702,0.961957,191.0
accuracy,0.971888,0.971888,0.971888,0.971888
macro avg,0.978193,0.963351,0.969832,498.0
weighted avg,0.973114,0.971888,0.971666,498.0


## Accuracy, True Positive rate, False Positive rate, True Negative rate, False Negative rate, Precision, Recall, F1-Score, and Support

In [75]:
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")
confu = confusion_matrix(y_train,model1_predictions)
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP

(307, 0, 14, 177)

In [76]:
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.9718875502008032

True Positive Rate/Sensitivity/Recall/Power: 0.9267015706806283
False Positive Rate/False Alarm Ratio/Fall-out: 0.0
True Negative Rate/Specificity/Selectivity: 1.0
False Negative Rate/Miss Rate: 0.07329842931937172

Precision/PPV: 1.0
F1 Score: 0.9619565217391305

Support (0): 191
Support (1): 307


## Model 1 Validation

In [77]:
x_validate = validate.drop(columns=['survived','sex','embarked'])
y_validate = validate[['survived']]

In [78]:
model1_val = rf1.predict(x_validate)

In [79]:
rf1.score(x_validate,y_validate)

0.8037383177570093

In [80]:
pd.DataFrame(
    confusion_matrix(y_validate,model1_val),
    columns = ['pred survived','pred opposite'], index=['actual opposite','actual survived']
)

Unnamed: 0,pred survived,pred opposite
actual opposite,116,16
actual survived,26,56


In [94]:
model1_val_report = pd.DataFrame(classification_report(y_validate,model1_val,output_dict=True)).T
model1_val_report

Unnamed: 0,precision,recall,f1-score,support
0,0.816901,0.878788,0.846715,132.0
1,0.777778,0.682927,0.727273,82.0
accuracy,0.803738,0.803738,0.803738,0.803738
macro avg,0.79734,0.780857,0.786994,214.0
weighted avg,0.80191,0.803738,0.800948,214.0


## Comparing Model 1 Train and Validate Reports

In [97]:
print(model1_test_report)
model1_val_report

              precision    recall  f1-score     support
0              0.956386  1.000000  0.977707  307.000000
1              1.000000  0.926702  0.961957  191.000000
accuracy       0.971888  0.971888  0.971888    0.971888
macro avg      0.978193  0.963351  0.969832  498.000000
weighted avg   0.973114  0.971888  0.971666  498.000000


Unnamed: 0,precision,recall,f1-score,support
0,0.816901,0.878788,0.846715,132.0
1,0.777778,0.682927,0.727273,82.0
accuracy,0.803738,0.803738,0.803738,0.803738
macro avg,0.79734,0.780857,0.786994,214.0
weighted avg,0.80191,0.803738,0.800948,214.0


## Model 2

In [85]:
# model 2
x_train2 = train.drop(columns=['survived','label','sex','embarked'])
y_train2 = train[['survived']]
y_train2 = y_train2.values.ravel()

In [86]:
# setting parameters for decision tree
rf2 = rf(max_depth=6,min_samples_leaf=4,random_state=4343)
rf2.fit(x_train2,y_train2)

In [87]:
model2_predictions = rf1.predict(x_train2)

In [88]:
pd.DataFrame(classification_report(y_train2,model2_predictions,output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.956386,1.0,0.977707,307.0
1,1.0,0.926702,0.961957,191.0
accuracy,0.971888,0.971888,0.971888,0.971888
macro avg,0.978193,0.963351,0.969832,498.0
weighted avg,0.973114,0.971888,0.971666,498.0


In [91]:
x_validate2 = validate.drop(columns=['survived','sex','embarked'])
y_validate2 = validate['survived']
y_validate2 = y_validate2.values.ravel()


In [92]:
model2_val = rf2.predict(x_validate2)

In [93]:
pd.DataFrame(classification_report(y_validate2,model2_val,output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.80137,0.886364,0.841727,132.0
1,0.779412,0.646341,0.706667,82.0
accuracy,0.794393,0.794393,0.794393,0.794393
macro avg,0.790391,0.766353,0.774197,214.0
weighted avg,0.792956,0.794393,0.789975,214.0


## Increase Min Leaf, Decrease Max Depth

## Min Samples Leaf 1-5 
Checking for max_depth

In [None]:
# 1
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=1)
        tree.fit(x_train, y_train)
        acc = tree.score(x_train, y_train)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.8
for depth of  2, the accuracy is 0.81
for depth of  3, the accuracy is 0.84
for depth of  4, the accuracy is 0.86
for depth of  5, the accuracy is 0.88
for depth of  6, the accuracy is 0.91
for depth of  7, the accuracy is 0.94
for depth of  8, the accuracy is 0.96
for depth of  9, the accuracy is 0.97
for depth of 10, the accuracy is 0.97
for depth of 11, the accuracy is 0.98
for depth of 12, the accuracy is 0.99
for depth of 13, the accuracy is 0.99
for depth of 14, the accuracy is 1.0
for depth of 15, the accuracy is 1.0
for depth of 16, the accuracy is 1.0
for depth of 17, the accuracy is 1.0
for depth of 18, the accuracy is 1.0
for depth of 19, the accuracy is 1.0


In [102]:
# 1
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=1)
        tree.fit(x_train, y_train)
        acc = tree.score(x_validate, y_validate)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.76
for depth of  2, the accuracy is 0.79
for depth of  3, the accuracy is 0.79
for depth of  4, the accuracy is 0.81
for depth of  5, the accuracy is 0.81
for depth of  6, the accuracy is 0.8
for depth of  7, the accuracy is 0.82
for depth of  8, the accuracy is 0.81
for depth of  9, the accuracy is 0.81
for depth of 10, the accuracy is 0.8
for depth of 11, the accuracy is 0.8
for depth of 12, the accuracy is 0.8
for depth of 13, the accuracy is 0.81
for depth of 14, the accuracy is 0.81
for depth of 15, the accuracy is 0.81
for depth of 16, the accuracy is 0.81
for depth of 17, the accuracy is 0.8
for depth of 18, the accuracy is 0.8
for depth of 19, the accuracy is 0.8


In [None]:
# 2
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=2)
        tree.fit(x_train, y_train)
        acc = tree.score(x_train, y_train)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.8
for depth of  2, the accuracy is 0.81
for depth of  3, the accuracy is 0.84
for depth of  4, the accuracy is 0.85
for depth of  5, the accuracy is 0.87
for depth of  6, the accuracy is 0.9
for depth of  7, the accuracy is 0.91
for depth of  8, the accuracy is 0.93
for depth of  9, the accuracy is 0.93
for depth of 10, the accuracy is 0.94
for depth of 11, the accuracy is 0.94
for depth of 12, the accuracy is 0.94
for depth of 13, the accuracy is 0.93
for depth of 14, the accuracy is 0.94
for depth of 15, the accuracy is 0.94
for depth of 16, the accuracy is 0.94
for depth of 17, the accuracy is 0.94
for depth of 18, the accuracy is 0.94
for depth of 19, the accuracy is 0.94


In [101]:
# 2 val
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=2)
        tree.fit(x_train, y_train)
        acc = tree.score(x_validate, y_validate)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.76
for depth of  2, the accuracy is 0.79
for depth of  3, the accuracy is 0.79
for depth of  4, the accuracy is 0.8
for depth of  5, the accuracy is 0.8
for depth of  6, the accuracy is 0.8
for depth of  7, the accuracy is 0.81
for depth of  8, the accuracy is 0.79
for depth of  9, the accuracy is 0.8
for depth of 10, the accuracy is 0.79
for depth of 11, the accuracy is 0.79
for depth of 12, the accuracy is 0.79
for depth of 13, the accuracy is 0.79
for depth of 14, the accuracy is 0.79
for depth of 15, the accuracy is 0.79
for depth of 16, the accuracy is 0.79
for depth of 17, the accuracy is 0.79
for depth of 18, the accuracy is 0.79
for depth of 19, the accuracy is 0.79


In [None]:
# 3
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=3)
        tree.fit(x_train, y_train)
        acc = tree.score(x_train, y_train)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.8
for depth of  2, the accuracy is 0.81
for depth of  3, the accuracy is 0.84
for depth of  4, the accuracy is 0.85
for depth of  5, the accuracy is 0.86
for depth of  6, the accuracy is 0.89
for depth of  7, the accuracy is 0.9
for depth of  8, the accuracy is 0.91
for depth of  9, the accuracy is 0.92
for depth of 10, the accuracy is 0.92
for depth of 11, the accuracy is 0.92
for depth of 12, the accuracy is 0.92
for depth of 13, the accuracy is 0.92
for depth of 14, the accuracy is 0.92
for depth of 15, the accuracy is 0.92
for depth of 16, the accuracy is 0.92
for depth of 17, the accuracy is 0.92
for depth of 18, the accuracy is 0.92
for depth of 19, the accuracy is 0.92


In [100]:
# 3 val
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=3)
        tree.fit(x_train, y_train)
        acc = tree.score(x_validate, y_validate)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.76
for depth of  2, the accuracy is 0.79
for depth of  3, the accuracy is 0.79
for depth of  4, the accuracy is 0.8
for depth of  5, the accuracy is 0.81
for depth of  6, the accuracy is 0.81
for depth of  7, the accuracy is 0.79
for depth of  8, the accuracy is 0.79
for depth of  9, the accuracy is 0.81
for depth of 10, the accuracy is 0.81
for depth of 11, the accuracy is 0.81
for depth of 12, the accuracy is 0.82
for depth of 13, the accuracy is 0.81
for depth of 14, the accuracy is 0.81
for depth of 15, the accuracy is 0.81
for depth of 16, the accuracy is 0.81
for depth of 17, the accuracy is 0.81
for depth of 18, the accuracy is 0.81
for depth of 19, the accuracy is 0.81


In [None]:
# 4
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=4)
        tree.fit(x_train, y_train)
        acc = tree.score(x_train, y_train)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.8
for depth of  2, the accuracy is 0.8
for depth of  3, the accuracy is 0.84
for depth of  4, the accuracy is 0.85
for depth of  5, the accuracy is 0.87
for depth of  6, the accuracy is 0.88
for depth of  7, the accuracy is 0.89
for depth of  8, the accuracy is 0.9
for depth of  9, the accuracy is 0.91
for depth of 10, the accuracy is 0.91
for depth of 11, the accuracy is 0.9
for depth of 12, the accuracy is 0.9
for depth of 13, the accuracy is 0.91
for depth of 14, the accuracy is 0.91
for depth of 15, the accuracy is 0.91
for depth of 16, the accuracy is 0.91
for depth of 17, the accuracy is 0.91
for depth of 18, the accuracy is 0.91
for depth of 19, the accuracy is 0.91


In [99]:
# 4 val
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=4)
        tree.fit(x_train, y_train)
        acc = tree.score(x_validate, y_validate)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.76
for depth of  2, the accuracy is 0.78
for depth of  3, the accuracy is 0.79
for depth of  4, the accuracy is 0.79
for depth of  5, the accuracy is 0.81
for depth of  6, the accuracy is 0.81
for depth of  7, the accuracy is 0.79
for depth of  8, the accuracy is 0.82
for depth of  9, the accuracy is 0.8
for depth of 10, the accuracy is 0.8
for depth of 11, the accuracy is 0.79
for depth of 12, the accuracy is 0.79
for depth of 13, the accuracy is 0.8
for depth of 14, the accuracy is 0.8
for depth of 15, the accuracy is 0.8
for depth of 16, the accuracy is 0.8
for depth of 17, the accuracy is 0.8
for depth of 18, the accuracy is 0.8
for depth of 19, the accuracy is 0.8


In [None]:
# 5
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=5)
        tree.fit(x_train, y_train)
        acc = tree.score(x_train, y_train)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.8
for depth of  2, the accuracy is 0.82
for depth of  3, the accuracy is 0.84
for depth of  4, the accuracy is 0.86
for depth of  5, the accuracy is 0.87
for depth of  6, the accuracy is 0.87
for depth of  7, the accuracy is 0.89
for depth of  8, the accuracy is 0.9
for depth of  9, the accuracy is 0.89
for depth of 10, the accuracy is 0.89
for depth of 11, the accuracy is 0.9
for depth of 12, the accuracy is 0.9
for depth of 13, the accuracy is 0.9
for depth of 14, the accuracy is 0.9
for depth of 15, the accuracy is 0.89
for depth of 16, the accuracy is 0.89
for depth of 17, the accuracy is 0.89
for depth of 18, the accuracy is 0.89
for depth of 19, the accuracy is 0.89


In [98]:
# 5 val
for x in range(1,20):
        tree = rf(max_depth=x, random_state=4343,min_samples_leaf=5)
        tree.fit(x_train, y_train)
        acc = tree.score(x_validate, y_validate)
        print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.76
for depth of  2, the accuracy is 0.78
for depth of  3, the accuracy is 0.79
for depth of  4, the accuracy is 0.79
for depth of  5, the accuracy is 0.81
for depth of  6, the accuracy is 0.8
for depth of  7, the accuracy is 0.79
for depth of  8, the accuracy is 0.81
for depth of  9, the accuracy is 0.79
for depth of 10, the accuracy is 0.79
for depth of 11, the accuracy is 0.8
for depth of 12, the accuracy is 0.79
for depth of 13, the accuracy is 0.79
for depth of 14, the accuracy is 0.8
for depth of 15, the accuracy is 0.79
for depth of 16, the accuracy is 0.79
for depth of 17, the accuracy is 0.79
for depth of 18, the accuracy is 0.79
for depth of 19, the accuracy is 0.79
