In [88]:
from math import sqrt
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
import statistics
import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import explore
import warnings
warnings.filterwarnings("ignore")

import graphviz
from graphviz import Graph

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [7]:
# Fetch the titanic data, then hand it to the prepare step, which returns the
# three splits unpacked below.
titanic_raw = acquire.get_titanic_data()
train, validate, test = prepare.prep_titanic_data(titanic_raw)

In [9]:
# Peek at the first three rows of the training split.
train.head(n=3)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
455,455,1,3,male,0,0,7.8958,Cherbourg,1,0,1,1,0,0
380,380,1,1,female,0,0,227.525,Cherbourg,1,1,0,1,0,0
492,492,0,1,male,0,0,30.5,Southampton,1,0,1,0,0,1


### What is your baseline prediction?
### What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode).

In [21]:
# Class counts of the target in train; the most frequent class is the baseline prediction.
train["survived"].value_counts()

0    329
1    205
Name: survived, dtype: int64

In [23]:
# Baseline = always predict the most frequent class in train (the mode).
# Derived from the data instead of being hard-coded, so it stays correct if the
# split or the class balance changes.
train["baseline"] = train.survived.mode()[0]

In [24]:
# Fraction of training rows where the baseline prediction equals the actual outcome.
baseline_hits = train["survived"] == train["baseline"]
baseline_hits.mean()

0.6161048689138576

### When you make those predictions, what is your accuracy? This is your baseline accuracy.
    61.6%. Yes, this is my baseline accuracy: it comes from predicting that no passenger survived.

### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [10]:
# Drop the raw string columns from every split; their encoded counterparts
# (sex_female/sex_male, embark_town_*) stay behind as numeric features.
# Reassign rather than mutate in place, and do not end the cell with a
# `train[""]` lookup: that key does not exist and raised a KeyError.
string_cols = ['sex', 'embark_town']
train = train.drop(columns=string_cols)
validate = validate.drop(columns=string_cols)
test = test.drop(columns=string_cols)

In [11]:
# Create X & y versions of each split: y is a Series holding just the target
# variable (survived) and X holds the remaining feature columns.
#
# 'baseline' is a constant helper column that the baseline-accuracy cell adds to
# train only. It is not a feature, and leaving it in X_train would give train a
# column that validate/test lack, so it is removed wherever it is present
# (errors='ignore' skips the drop on splits that never had it).
# NOTE(review): passenger_id appears to remain among the features — confirm
# that it is intended as a predictor.

X_train = train.drop(columns=['survived']).drop(columns=['baseline'], errors='ignore')
y_train = train.survived

X_validate = validate.drop(columns=['survived']).drop(columns=['baseline'], errors='ignore')
y_validate = validate.survived

X_test = test.drop(columns=['survived']).drop(columns=['baseline'], errors='ignore')
y_test = test.survived

In [12]:
# Decision tree capped at three levels, with a fixed seed so the fit is repeatable.
clf = DecisionTreeClassifier(random_state=123, max_depth=3)

In [13]:
# Fit the tree on the training features and target.
clf = clf.fit(X=X_train, y=y_train)

In [14]:
# In-sample class predictions; display the first five.
y_pred = clf.predict(X_train)
y_pred[:5]

array([0, 1, 0, 0, 0])

In [15]:
# In-sample per-class probability estimates; display the first five rows.
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

array([[0.90322581, 0.09677419],
       [0.        , 1.        ],
       [0.61458333, 0.38541667],
       [0.86666667, 0.13333333],
       [0.90322581, 0.09677419]])

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [16]:
# In-sample accuracy of the fitted tree, reported to two decimals.
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.81


In [25]:
# Rows are the actual classes, columns are the predicted classes.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[293,  36],
       [ 63, 142]])

In [26]:
# Actual class counts in train, for comparison against the confusion matrix rows.
y_train.value_counts()

0    329
1    205
Name: survived, dtype: int64

In [27]:
# Wrap the confusion matrix in a DataFrame whose rows (actual) and columns
# (predicted) are labelled with the sorted class values.
labels = sorted(y_train.unique())
cm = confusion_matrix(y_train, y_pred)

pd.DataFrame(cm, index=labels, columns=labels)

Unnamed: 0,0,1
0,293,36
1,63,142


In [28]:
# Per-class precision/recall/f1 for the in-sample predictions.
report = classification_report(y_train, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.89      0.86       329
           1       0.80      0.69      0.74       205

    accuracy                           0.81       534
   macro avg       0.81      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534



In [20]:
# Export the fitted tree as DOT text (out_file=None returns it as a string),
# then render it to a file and open the result in a viewer (view=True).
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['died', 'lived'],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

# Run through steps 2-4 using a different max_depth value.

In [73]:
def train_model(max_depth):
    """Fit a decision tree of the given depth and return its in-sample accuracy.

    Reads the notebook-level X_train / y_train.

    Parameters
    ----------
    max_depth : int
        Maximum depth passed to DecisionTreeClassifier.

    Returns
    -------
    float
        Accuracy of the fitted tree on (X_train, y_train).
    """
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=123)
    clf.fit(X_train, y_train)
    # score() predicts internally, so no separate predict/predict_proba calls are needed.
    return clf.score(X_train, y_train)

def validate_model(max_depth):
    """Fit a decision tree of the given depth on train and return its validate accuracy.

    Reads the notebook-level X_train / y_train for fitting and
    X_validate / y_validate for scoring.

    Parameters
    ----------
    max_depth : int
        Maximum depth passed to DecisionTreeClassifier.

    Returns
    -------
    float
        Accuracy of the fitted tree on (X_validate, y_validate).
    """
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=123)
    clf.fit(X_train, y_train)
    # score() predicts internally, so no separate predict/predict_proba calls are needed.
    return clf.score(X_validate, y_validate)

In [84]:
# Compare in-sample and out-of-sample accuracy for tree depths 1 through 6.
max_depths = range(1, 7)
for depth in max_depths:
    print(f'Accuracy of Decision Tree classifier in depth: {depth}: train = {round(train_model(depth), 3)} validate = {round(validate_model(depth), 3)}')

Accuracy of Decision Tree classifier in depth: 1: train = 0.792 validate = 0.775
Accuracy of Decision Tree classifier in depth: 2: train = 0.792 validate = 0.775
Accuracy of Decision Tree classifier in depth: 3: train = 0.815 validate = 0.787
Accuracy of Decision Tree classifier in depth: 4: train = 0.826 validate = 0.787
Accuracy of Decision Tree classifier in depth: 5: train = 0.843 validate = 0.736
Accuracy of Decision Tree classifier in depth: 6: train = 0.878 validate = 0.781


### Which model performs better on your in-sample data?
    Max depth 6 performed best on my in-sample (train) data, with an accuracy of 0.878.

### Which model performs best on your out-of-sample data, the validate set?
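    Max depths 3 and 4 performed best on the validate set (0.787 accuracy each). Max depth 2 (tied with depth 1) had the smallest gap between train and validate accuracy (0.792 vs 0.775).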

# Random Forest Exercises

### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [90]:
# Random forest configured as the exercise asks (min_samples_leaf=1,
# max_depth=10), with a fixed seed so the fit is repeatable.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_leaf': 1,
    'criterion': 'gini',
    'bootstrap': True,
    'class_weight': None,
    'random_state': 123,
}
rf = RandomForestClassifier(**rf_params)

In [91]:
# Fit the forest on the training features and target.
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=10, random_state=123)

### Evaluate your results using the model score, confusion matrix, and classification report.

In [96]:
# In-sample class predictions from the forest; display the first five.
y_pred = rf.predict(X_train)
y_pred[:5]

array([0, 1, 0, 1, 0])

In [100]:
# Per-class probability estimates from the forest for every training row.
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

array([[0.62394629, 0.37605371],
       [0.01      , 0.99      ],
       [0.71781496, 0.28218504],
       ...,
       [0.92561887, 0.07438113],
       [0.96809372, 0.03190628],
       [0.02      , 0.98      ]])

In [101]:
# In-sample accuracy of the fitted forest, reported to two decimals.
rf_train_accuracy = rf.score(X_train, y_train)
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf_train_accuracy))

Accuracy of random forest classifier on training set: 0.97


In [102]:
# Confusion matrix for the forest's in-sample predictions (rows actual, columns predicted).
rf_cm = confusion_matrix(y_train, y_pred)
print(rf_cm)

[[329   0]
 [ 15 190]]


In [103]:
# Per-class precision/recall/f1 for the forest's in-sample predictions.
rf_report = classification_report(y_train, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       329
           1       1.00      0.93      0.96       205

    accuracy                           0.97       534
   macro avg       0.98      0.96      0.97       534
weighted avg       0.97      0.97      0.97       534



In [104]:
# Out-of-sample check: this scores the validate split, so the message says
# "validate set" (it previously said "test set", which misreported the split).
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.77


In [114]:
def _fit_forest(leaf, depth):
    """Return a 100-tree random forest fitted on the notebook-level X_train / y_train."""
    rf = RandomForestClassifier(bootstrap=True,
                                class_weight=None,
                                criterion='gini',
                                min_samples_leaf=leaf,
                                n_estimators=100,
                                max_depth=depth,
                                random_state=123)
    # fit() returns the estimator itself, so the fitted forest can be returned directly.
    return rf.fit(X_train, y_train)


def forest_train(leaf, depth):
    """Fit a random forest and return its in-sample accuracy.

    Parameters
    ----------
    leaf : int
        Value passed as min_samples_leaf.
    depth : int
        Value passed as max_depth.

    Returns
    -------
    float
        Accuracy on (X_train, y_train), rounded to three decimals.
    """
    accuracy = _fit_forest(leaf, depth).score(X_train, y_train)
    return round(accuracy, 3)


def forest_validate(leaf, depth):
    """Fit a random forest on train and return its accuracy on the validate split.

    Parameters
    ----------
    leaf : int
        Value passed as min_samples_leaf.
    depth : int
        Value passed as max_depth.

    Returns
    -------
    float
        Accuracy on (X_validate, y_validate), rounded to three decimals.
    """
    accuracy = _fit_forest(leaf, depth).score(X_validate, y_validate)
    return round(accuracy, 3)

In [117]:
# Sweep min_samples_leaf and max_depth from 1 to 10 and report train vs validate
# accuracy for each combination.
for leaf in range(1, 11):
    for depth in range(1, 11):
        # Compute each score once and reuse it for the difference, instead of
        # fitting the same forest again just to subtract the two numbers.
        train_acc = forest_train(leaf, depth)
        validate_acc = forest_validate(leaf, depth)
        print(f"For leaf: {leaf} and depth: {depth} train accuracy is: {train_acc}")
        print(f"For leaf: {leaf} and depth: {depth} validate accuracy is: {validate_acc}")
        # Round the gap so float noise (e.g. 0.02400000000000002) is not printed.
        print(f"Difference is: {round(train_acc - validate_acc, 3)}")

For leaf: 1 and depth: 1 accuracy is: 0.794
For leaf: 1 and depth: 1 accuracy is: 0.77
Difference is: 0.02400000000000002
For leaf: 1 and depth: 2 accuracy is: 0.792
For leaf: 1 and depth: 2 accuracy is: 0.775
Difference is: 0.017000000000000015
For leaf: 1 and depth: 3 accuracy is: 0.813
For leaf: 1 and depth: 3 accuracy is: 0.787
Difference is: 0.025999999999999912
For leaf: 1 and depth: 4 accuracy is: 0.837
For leaf: 1 and depth: 4 accuracy is: 0.798
Difference is: 0.038999999999999924
For leaf: 1 and depth: 5 accuracy is: 0.854
For leaf: 1 and depth: 5 accuracy is: 0.798
Difference is: 0.05599999999999994
For leaf: 1 and depth: 6 accuracy is: 0.899
For leaf: 1 and depth: 6 accuracy is: 0.787
Difference is: 0.11199999999999999
For leaf: 1 and depth: 7 accuracy is: 0.929
For leaf: 1 and depth: 7 accuracy is: 0.787
Difference is: 0.14200000000000002
For leaf: 1 and depth: 8 accuracy is: 0.949
For leaf: 1 and depth: 8 accuracy is: 0.792
Difference is: 0.15699999999999992
For leaf: 1 an

Difference is: 0.027999999999999914
For leaf: 7 and depth: 8 accuracy is: 0.835
For leaf: 7 and depth: 8 accuracy is: 0.803
Difference is: 0.03199999999999992
For leaf: 7 and depth: 9 accuracy is: 0.837
For leaf: 7 and depth: 9 accuracy is: 0.809
Difference is: 0.027999999999999914
For leaf: 7 and depth: 10 accuracy is: 0.837
For leaf: 7 and depth: 10 accuracy is: 0.803
Difference is: 0.03399999999999992
For leaf: 8 and depth: 1 accuracy is: 0.794
For leaf: 8 and depth: 1 accuracy is: 0.77
Difference is: 0.02400000000000002
For leaf: 8 and depth: 2 accuracy is: 0.792
For leaf: 8 and depth: 2 accuracy is: 0.775
Difference is: 0.017000000000000015
For leaf: 8 and depth: 3 accuracy is: 0.805
For leaf: 8 and depth: 3 accuracy is: 0.775
Difference is: 0.030000000000000027
For leaf: 8 and depth: 4 accuracy is: 0.822
For leaf: 8 and depth: 4 accuracy is: 0.798
Difference is: 0.02399999999999991
For leaf: 8 and depth: 5 accuracy is: 0.841
For leaf: 8 and depth: 5 accuracy is: 0.809
Difference 