# Decision Tree Exercises

#### Using Titanic Data
- Remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz

from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
import prepare

In [2]:
# Get Titanic Data
df = prepare.prep_titanic()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,Southampton,0,1,0,1
1,1,1,1,female,1,0,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,Southampton,1,0,0,1
3,3,1,1,female,1,0,Southampton,0,0,0,1
4,4,0,3,male,0,0,Southampton,1,1,0,1


In [3]:
drop_cols = ['sex','embark_town', 'passenger_id']
df = df.drop(columns=drop_cols)
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,0,1,0,1
1,1,1,1,0,0,0,0,0
2,1,3,0,0,1,0,0,1
3,1,1,1,0,0,0,0,1
4,0,3,0,0,1,1,0,1


In [4]:
#Split Data
train, validate, test = prepare.split_titanic(df)
train.shape, validate.shape, test.shape

((498, 8), (214, 8), (179, 8))

## <font color = 'red'> 1a) What is your baseline prediction?

In [5]:
# Find most often used case in survived
train.survived.value_counts() #most frequent is not_survived

0    307
1    191
Name: survived, dtype: int64

In [6]:
### Baseline Prediction to 0 or Not_Survived

## <font color = 'red'> 1a) What is your baseline accuracy?

### ??? Removed column for baseline because it threw off the validation

### Baseline accuracy is 61.6%

## <font color = 'red'>2) Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [7]:
# Create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns='survived')
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [8]:
X_train.shape

(498, 7)

In [9]:
X_validate.shape

(214, 7)

In [10]:
# Create the Decision Tree Object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf

DecisionTreeClassifier(max_depth=3, random_state=123)

In [11]:
# Fit the Model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=123)

In [12]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [13]:
# make prediction on train obeservations

y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [14]:
# provide probabiblity on train observations
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.62686567, 0.37313433],
       [0.74285714, 0.25714286],
       [0.74285714, 0.25714286],
       [0.03703704, 0.96296296],
       [0.03703704, 0.96296296]])

## <font color = 'red' >3) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [15]:
# Computer Accuracy Score
clf.score(X_train, y_train)

0.8192771084337349

In [16]:
# Create Confusion Matrix
confusion_matrix(y_train, y_pred)

array([[294,  13],
       [ 77, 114]])

In [17]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [18]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,294,13
1,77,114


In [19]:
# Create Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       307
           1       0.90      0.60      0.72       191

    accuracy                           0.82       498
   macro avg       0.85      0.78      0.79       498
weighted avg       0.83      0.82      0.81       498



## <font color = 'red'>4) Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [20]:
# Find true positive rate, false positive rate, true negative rate, false negative rate
# Compute TPR,TNR,FPR,FNR
TP = 294
TN = 114
FP = 77
FN = 13
ALL = TP + TN + FP + FN

accuracy = (TP + TN) / ALL
TPR = TP / (TP + FN) 
TNR = TN / (TN + FP) 
FNR = FN / (FN + TP)  
FPR = FP / (FP + TN) 
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2* (precision*recall) / (precision+recall)
support_pos = TP + FN
support_neg = TN + FP


print(f'Accuracy: {accuracy}')
print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')
print(f'Precision: {precision}')
print(f'recall: {recall}')
print(f'support_pos: {support_pos}')
print(f'support_neg: {support_neg}')


Accuracy: 0.8192771084337349
True Positive Rate: 0.9576547231270358
True Negative Rate: 0.5968586387434555
False Positive Rate: 0.4031413612565445
False Negative Rate: 0.04234527687296417
Precision: 0.7924528301886793
recall: 0.9576547231270358
support_pos: 307
support_neg: 191


## <font color = 'red'>5) Run through steps 2-4 using a different max_depth value.

### <font color = 'red'> Using Max_Depth 4

In [21]:
# Create the Decision Tree Object increasing max_depth to 4
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)

In [22]:
# make prediction on train obeservations
y_pred = clf.predict(X_train)

In [23]:
# find probability of train observations
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.66666667, 0.33333333],
       [0.86956522, 0.13043478],
       [0.86956522, 0.13043478],
       [0.        , 1.        ],
       [0.05      , 0.95      ]])

In [24]:
# Compute Accuracy
clf.score(X_train, y_train)

0.821285140562249

In [25]:
# Create Confusion Matrix
confusion_matrix(y_train, y_pred)

array([[288,  19],
       [ 70, 121]])

In [26]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [27]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,288,19
1,70,121


In [28]:
# Create classification report to find precision, reall, f1-score, support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.94      0.87       307
           1       0.86      0.63      0.73       191

    accuracy                           0.82       498
   macro avg       0.83      0.79      0.80       498
weighted avg       0.83      0.82      0.81       498



In [29]:
# Find true positive rate, false positive rate, true negative rate, false negative rate
# Compute TPR,TNR,FPR,FNR
TP = 298
TN = 116
FP = 75
FN = 9

TPR = TP / (TP + FN) 
TNR = TN / (TN + FP) 
FNR = FN / (FN + TP)  
FPR = FP / (FP + TN) 


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

True Positive Rate: 0.9706840390879479
True Negative Rate: 0.6073298429319371
False Positive Rate: 0.39267015706806285
False Negative Rate: 0.029315960912052116


### <font color = 'red'> Using Max_Depth 5

In [30]:
# Create the Decision Tree Object increasing max_depth to 5
clf = DecisionTreeClassifier(max_depth=5, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)
# make prediction on train obeservations
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       307
           1       0.82      0.71      0.76       191

    accuracy                           0.83       498
   macro avg       0.83      0.81      0.82       498
weighted avg       0.83      0.83      0.83       498



In [31]:
# Compute Accuracy
clf.score(X_train, y_train)

0.8313253012048193

In [32]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,278,29
1,55,136


In [33]:
# Find true positive rate, false positive rate, true negative rate, false negative rate
# Compute TPR,TNR,FPR,FNR
TP = 291
TN = 131
FP = 60
FN = 16

TPR = TP / (TP + FN) 
TNR = TN / (TN + FP) 
FNR = FN / (FN + TP)  
FPR = FP / (FP + TN) 


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

True Positive Rate: 0.9478827361563518
True Negative Rate: 0.6858638743455497
False Positive Rate: 0.31413612565445026
False Negative Rate: 0.05211726384364821


### <font color = 'red'> Using Max_Depth 6

In [34]:
# Create the Decision Tree Object increasing max_depth to 6
clf = DecisionTreeClassifier(max_depth=6, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)
# make prediction on train obeservations
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89       307
           1       0.91      0.69      0.78       191

    accuracy                           0.85       498
   macro avg       0.87      0.82      0.84       498
weighted avg       0.86      0.85      0.85       498



In [35]:
# Compute Accuracy
clf.score(X_train, y_train)

0.8534136546184738

In [36]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,294,13
1,60,131


In [37]:
# Find true positive rate, false positive rate, true negative rate, false negative rate
# Compute TPR,TNR,FPR,FNR
TP = 282
TN = 151
FP = 40
FN = 25

TPR = TP / (TP + FN) 
TNR = TN / (TN + FP) 
FNR = FN / (FN + TP)  
FPR = FP / (FP + TN) 


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

True Positive Rate: 0.9185667752442996
True Negative Rate: 0.7905759162303665
False Positive Rate: 0.2094240837696335
False Negative Rate: 0.08143322475570032


In [38]:
for i in range(2,20):
    # Create the Decision Tree Object increasing max_depth to 6
    clf = DecisionTreeClassifier(max_depth=i, random_state=123)
    
    # Fit the Model
    clf = clf.fit(X_train, y_train)
    
    # make prediction on train obeservations
    y_pred = clf.predict(X_train)
    
    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.792453    0.897638  0.819277    0.845045      0.832795
recall       0.957655    0.596859  0.819277    0.777257      0.819277
f1-score     0.867257    0.716981  0.819277    0.792119      0.809621
support    307.000000  191.000000  0.819277  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.804469    0.864286  0.821285    0.834377      0.827411
recall       0.938111    0.633508  0.821285    0.785809      0.821285
f1-score     

## <font color='red'>6) Which model performs better on your in-sample data?

In [39]:
# Model using Max Depth 6 because it has the best accuracy

## <font color = 'red'>7) Which model performs best on your out-of-sample data, the validate set?

In [40]:
# Create the Decision Tree Object increasing max_depth to 3
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)
# make prediction on train obeservations
y_pred = clf.predict(X_validate)

print(clf.score(X_validate, y_validate))


0.794392523364486


In [41]:
X_train.shape

(498, 7)

In [42]:
X_validate.shape

(214, 7)

In [43]:
# Create the Decision Tree Object increasing max_depth to 4
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

0.794392523364486
              precision    recall  f1-score   support

           0       0.77      0.95      0.85       132
           1       0.88      0.54      0.67        82

    accuracy                           0.79       214
   macro avg       0.82      0.75      0.76       214
weighted avg       0.81      0.79      0.78       214



In [44]:
# Create the Decision Tree Object increasing max_depth to 5
clf = DecisionTreeClassifier(max_depth=5, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

0.7663551401869159
              precision    recall  f1-score   support

           0       0.77      0.95      0.85       132
           1       0.88      0.54      0.67        82

    accuracy                           0.79       214
   macro avg       0.82      0.75      0.76       214
weighted avg       0.81      0.79      0.78       214



In [45]:
# Create the Decision Tree Object increasing max_depth to 6
clf = DecisionTreeClassifier(max_depth=6, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

0.7710280373831776
              precision    recall  f1-score   support

           0       0.77      0.95      0.85       132
           1       0.88      0.54      0.67        82

    accuracy                           0.79       214
   macro avg       0.82      0.75      0.76       214
weighted avg       0.81      0.79      0.78       214



In [46]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []

for i in range(2, 25):
    # Make the model
    clf = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    clf = clf.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = clf.score(X_train, y_train)
    
    out_of_sample_accuracy = clf.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.819277,0.794393,0.024885
2,4,0.821285,0.794393,0.026893
3,5,0.831325,0.766355,0.06497
4,6,0.853414,0.771028,0.082386
5,7,0.85743,0.761682,0.095747
6,8,0.859438,0.757009,0.102428
7,9,0.859438,0.757009,0.102428
8,10,0.859438,0.757009,0.102428
9,11,0.859438,0.757009,0.102428


### Model with Max_Depth 6 has the best F1-Scores

## <font color = 'red'>8) Work through these same exercises using the Telco dataset.

In [47]:
import prepare

In [48]:
df = prepare.prep_telco()
drop_cols = ['customer_id',
 'gender','partner',
 'dependents','phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing','contract_type',
 'internet_service_type',
 'payment_type']
df = df.drop(columns=drop_cols)
df

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,churn,senior_citizen.1,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0,65,90.45,5957.90,No,0,0,1,1,1,...,1,1,0,0,1,0,0,0,0,1
1,0,54,45.20,2460.55,No,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,56,45.05,2560.10,No,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,20,39.40,825.40,No,0,1,1,1,0,...,0,1,0,0,1,0,0,1,0,0
4,0,72,85.15,6316.20,No,0,1,1,0,1,...,1,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,20.05,20.05,No,0,0,1,1,1,...,0,0,0,0,0,0,1,0,0,1
7039,0,19,19.90,367.55,No,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
7040,0,6,19.70,129.55,No,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
7041,0,1,18.90,18.90,No,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1


# <font color='red'> Random Forest

##  <font color='red'> 1) Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [49]:
# Create the Model's Object
rf = RandomForestClassifier(max_depth=10,
                           random_state=123,
                           min_samples_leaf=1)
rf

RandomForestClassifier(max_depth=10, random_state=123)

In [50]:
#Fit the Data to the model
rf = rf.fit(X_train,y_train)

In [52]:
# Get Feature (column) Importances. sex_male (48%) has the most importance
print(rf.feature_importances_)

[0.20087185 0.12628004 0.091555   0.03498795 0.47979937 0.02186451
 0.04464129]


In [53]:
# Make Predictions
y_pred = rf.predict(X_train)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [54]:
# Find Probabability of each Prediction
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

array([[7.53151354e-01, 2.46848646e-01],
       [2.80579574e-01, 7.19420426e-01],
       [9.76000000e-01, 2.40000000e-02],
       [0.00000000e+00, 1.00000000e+00],
       [9.35694720e-02, 9.06430528e-01],
       [8.66036214e-01, 1.33963786e-01],
       [3.25888889e-01, 6.74111111e-01],
       [8.75629507e-01, 1.24370493e-01],
       [9.32874395e-01, 6.71256048e-02],
       [1.00000000e+00, 0.00000000e+00],
       [2.43932540e-01, 7.56067460e-01],
       [5.93532127e-01, 4.06467873e-01],
       [0.00000000e+00, 1.00000000e+00],
       [5.93532127e-01, 4.06467873e-01],
       [8.46741730e-01, 1.53258270e-01],
       [8.99012579e-01, 1.00987421e-01],
       [8.99012579e-01, 1.00987421e-01],
       [2.32714357e-01, 7.67285643e-01],
       [8.75629507e-01, 1.24370493e-01],
       [6.45130592e-01, 3.54869408e-01],
       [2.43932540e-01, 7.56067460e-01],
       [8.99012579e-01, 1.00987421e-01],
       [0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [5.935321

##  <font color='red'> 2) Evaluate your results using the model score, confusion matrix, and classification report.

In [55]:
# Find Model Score for Accuracy
rf.score(X_train,y_train)

0.8594377510040161

In [60]:
# Make Confusion Matrix
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,293,14
1,56,135


In [62]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89       307
           1       0.91      0.71      0.79       191

    accuracy                           0.86       498
   macro avg       0.87      0.83      0.84       498
weighted avg       0.87      0.86      0.86       498



In [64]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.839542,0.90604,0.859438,0.872791,0.865046
recall,0.954397,0.706806,0.859438,0.830602,0.859438
f1-score,0.893293,0.794118,0.859438,0.843705,0.855256
support,307.0,191.0,0.859438,498.0,498.0


##  <font color='red'> 3) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [65]:
TP = 293
FP = 56
FN = 14
TN = 135
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.8594377510040161
True Positive Rate: 0.9543973941368078
False Positive Rate: 0.2931937172774869
True Negative Rate: 0.7068062827225131
False Negative Rate: 0.04560260586319218
Precision: 0.839541547277937
Recall: 0.9543973941368078
F1 Score: 0.8932926829268293
Support (0): 307
Support (1): 191


##  <font color='red'> 4) Run through steps increasing your min_samples_leaf and decreasing your max_depth

In [82]:
for i in range(1,11):
    rf = RandomForestClassifier(max_depth=11-i,
                           random_state=123,
                           min_samples_leaf=i)
    rf = rf.fit(X_train,y_train)
    y_pred = rf.predict(X_train)
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Random Forest with max_depth of {11-i} and min_sample_leaf {i}")
    print(pd.DataFrame(report))
    print()

Random Forest with max_depth of 10 and min_sample_leaf 1
                    0           1  accuracy   macro avg  weighted avg
precision    0.839542    0.906040  0.859438    0.872791      0.865046
recall       0.954397    0.706806  0.859438    0.830602      0.859438
f1-score     0.893293    0.794118  0.859438    0.843705      0.855256
support    307.000000  191.000000  0.859438  498.000000    498.000000

Random Forest with max_depth of 9 and min_sample_leaf 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.819209    0.881944  0.837349    0.850577      0.843270
recall       0.944625    0.664921  0.837349    0.804773      0.837349
f1-score     0.877458    0.758209  0.837349    0.817834      0.831722
support    307.000000  191.000000  0.837349  498.000000    498.000000

Random Forest with max_depth of 8 and min_sample_leaf 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.820513    0.870748  0.835341    0.845631    

##  <font color='red'> 5a) What are the differences in the evaluation metrics? 

???

##  <font color='red'> 5a)Which performs better on your in-sample data? Why?

Random Forest with max_depth of 8 and min_sample_leaf 3 performs best. ???

##  <font color='red'> 6) After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [83]:
# compare in-sample to out-of-sample
metrics = []

for i in range(1,11):
    # Make the model
    rf = RandomForestClassifier(max_depth=11-i,
                           random_state=123,
                           min_samples_leaf=i)

    # Fit the model (on train and only train)
    rf = rf.fit(X_train, y_train)
    
    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = rf.score(X_train, y_train)
    
    out_of_sample_accuracy = rf.score(X_validate, y_validate)

    output = {
        "max_depth": 11-i,
        "min_sample": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,min_sample,train_accuracy,validate_accuracy,difference
0,10,1,0.859438,0.757009,0.102428
1,9,2,0.837349,0.794393,0.042957
2,8,3,0.835341,0.799065,0.036276
3,7,4,0.833333,0.794393,0.038941
4,6,5,0.833333,0.794393,0.038941
5,5,6,0.827309,0.794393,0.032917
6,4,7,0.823293,0.794393,0.028901
7,3,8,0.825301,0.78972,0.035582
8,2,9,0.801205,0.78972,0.011485
9,1,10,0.73494,0.719626,0.015314


### model with max_depth = 2, min_sample = 9