# Modeling Exercises

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pydataset import data
import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from env import host, user, password

In [2]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

## Decision Tree

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [3]:
from acquire import get_titanic_data

In [4]:
titanic = get_titanic_data()
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           714 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [6]:
#Preapare the data: Drop columns 
columns_to_drop = ['Unnamed: 0','pclass', 'embark_town', 'embarked', 'deck',  'parch', 'passenger_id', 'age']
titanic.drop(columns=columns_to_drop, inplace=True)

In [7]:
titanic.head()

Unnamed: 0,survived,sex,sibsp,fare,class,alone
0,0,male,1,7.25,Third,0
1,1,female,1,71.2833,First,0
2,1,female,0,7.925,Third,1
3,1,female,1,53.1,First,0
4,0,male,0,8.05,Third,1


In [8]:
#Decode survived ==> This is the target variable and targets do not need to be encoded for classification decision trees
titanic.rename(columns={'survived':"survival"}, inplace=True)
titanic['survived'] = titanic.survival.apply(lambda n: 'survived' if n > 0 else 'died')

In [9]:
#Encode Class and Sex as these are features we want to explore. The additional features of fare, alone, sibsp are already encoded. 
dummies = pd.get_dummies(titanic[['class', 'sex']], drop_first=[True])

In [10]:
#Concat dummies df with Titanic 
titanic = pd.concat([titanic, dummies], axis=1)

In [11]:
#drop additional columns after encoding
titanic.drop(columns={'sex','class','survival'}, inplace=True)

In [12]:
#split data into train, validate, and test
#stratidfy by survived 
train, validate, test = train_validate_test_split(titanic, target='survived', seed=123)

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [13]:
#target == survived
#features == sex, class, fare, alone, siblings

In [14]:
#Determine the baseline prediction by finding the mode for survived
train.survived.value_counts()

died        307
survived    191
Name: survived, dtype: int64

In [15]:
#Establish baseline prediction
train['baseline_prediction'] = 'died'

In [16]:
#Calculate baseline accuracy
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
baseline_accuracy

0.6164658634538153

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [17]:
#Set-up Model 1 Decision Tree with a max depth of 2
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [18]:
#Fit the decision tree to training data
clf = clf.fit(X_train, y_train)

In [19]:
#Check labels for class names
clf.classes_

array(['died', 'survived'], dtype=object)

In [20]:
#Visualize decision tree: 

#import graphviz
#from graphviz import Graph
#
#dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None, class_names=clf.classes_)
#graph = graphviz.Source(dot_data) 
#
#graph.render('titanic_decision_tree', view=True)

In [21]:
#Peek at the first 5 predictions from model 1
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['died', 'died', 'died', 'survived', 'survived'], dtype=object)

In [22]:
#Compare the predictions against the actuals
train.head()

Unnamed: 0,sibsp,fare,alone,survived,class_Second,class_Third,sex_male,baseline_prediction
583,0,40.125,1,died,0,0,1,died
165,0,20.525,0,survived,0,1,1,died
50,4,39.6875,0,died,0,1,1,died
259,0,26.0,0,survived,1,0,0,died
306,0,110.8833,1,survived,0,0,0,died


In [23]:
#Calculate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.04255319, 0.95744681],
       [0.04255319, 0.95744681]])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [24]:
#Model Score: 
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [25]:
#Evealuate with Cofusion Matrix
confusion_matrix(y_train, y_pred)

array([[265,  42],
       [ 58, 133]])

In [26]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,died,survived
died,265,42
survived,58,133


In [27]:
#Evaluate with Classififcation Report
model_1_report = classification_report(y_train, y_pred)
print(model_1_report)

              precision    recall  f1-score   support

        died       0.82      0.86      0.84       307
    survived       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [28]:
#Accuracy
print(f'Model has an accuracy score of {accuracy_score(y_pred, y_train):.2%}')

Model has an accuracy score of 79.92%


In [29]:
#Rates from confusions matrix 
#Where dead is being treated as the positive class and survived is the negative class

sample_count = len(train)

tp = 265
fp = 58
fn = 42
tn = 133

print(f'True Positive Rate is: {tp/sample_count:.2%}')
print(f'False Positive Rate is: {fp/sample_count:.2%}')
print(f'True Negative Rate is: {tn/sample_count:.2%}')
print(f'False Negative Rate is: {fn/sample_count:.2%}')

True Positive Rate is: 53.21%
False Positive Rate is: 11.65%
True Negative Rate is: 26.71%
False Negative Rate is: 8.43%


In [30]:
#Precision
precision_score(y_train, y_pred, labels=['survived', 'died'], average=None)

array([0.76      , 0.82043344])

In [31]:
#Recall
recall_score(y_train, y_pred, labels=['survived', 'died'], average=None)

array([0.69633508, 0.86319218])

In [32]:
#f1-score
f1_score(y_train, y_pred, labels=['survived', 'died'], average=None)

array([0.72677596, 0.84126984])

In [33]:
#Support
precision_recall_fscore_support(y_train, y_pred, labels=['survived', 'died'], average=None)[3]

array([191, 307])

### 5. Run through steps 2-4 using a different max_depth value.

In [34]:
#Set-up Model 2 using a decision treet with a max depth of 3 
clf_2 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [35]:
#Fit the training data to Model 2 
clf_2 = clf_2.fit(X_train, y_train)

In [36]:
#Check labels
clf_2.classes_

array(['died', 'survived'], dtype=object)

In [37]:
#Visualize the decision tree:

#dot_data = export_graphviz(clf_2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None, class_names=clf_2.classes_)
#graph = graphviz.Source(dot_data) 
#
#graph.render('titanic_2_decision_tree', view=True)

In [38]:
#Peek at the first 5 predictions from model 2 
y_pred_2 = clf_2.predict(X_train)
y_pred_2[0:5]

array(['died', 'died', 'died', 'survived', 'survived'], dtype=object)

In [39]:
#Calculate the probabilities for each class
y_pred_proba = clf_2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.07142857, 0.92857143],
       [0.01923077, 0.98076923]])

#### Model 2 Evaluation

In [40]:
#Model 2 Score: 
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [41]:
#Evealuate with Cofusion Matrix
confusion_matrix(y_train, y_pred_2)

array([[276,  31],
       [ 57, 134]])

In [42]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred_2), index=labels, columns=labels)

Unnamed: 0,died,survived
died,276,31
survived,57,134


In [43]:
model_2_report = classification_report(y_train, y_pred_2)

In [44]:
print(model_2_report)

              precision    recall  f1-score   support

        died       0.83      0.90      0.86       307
    survived       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



### 6. Which model performs better on your in-sample data?

### ***Model 2 performs better on in-sample data***

In [45]:
print(model_1_report, model_2_report)

              precision    recall  f1-score   support

        died       0.82      0.86      0.84       307
    survived       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498
               precision    recall  f1-score   support

        died       0.83      0.90      0.86       307
    survived       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



### 7. Which model performs best on your out-of-sample data, the validate set?

### ***Model 2 performs better on the out-of-sample data***

#### Evaluate Model 1 on out-of-sample data

In [46]:
#Model 1 Accuracy for Validate Data set
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [47]:
y_pred = clf.predict(X_validate)

In [48]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

        died       0.80      0.83      0.81       132
    survived       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



#### Evaluate Model 2 on out-of-sample data

In [49]:
# Model 2 Accuracy for Validate Data set
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [50]:
y_pred_2 = clf_2.predict(X_validate)

In [51]:
print(classification_report(y_validate, y_pred_2))

              precision    recall  f1-score   support

        died       0.80      0.87      0.83       132
    survived       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78       214



## Random Forest Exercises

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [54]:
train.head()

Unnamed: 0,sibsp,fare,alone,survived,class_Second,class_Third,sex_male
583,0,40.125,1,died,0,0,1
165,0,20.525,0,survived,0,1,1
50,4,39.6875,0,died,0,1,1
259,0,26.0,0,survived,1,0,0
306,0,110.8833,1,survived,0,0,0


### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [55]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [56]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [57]:
y_pred = rf.predict(X_train)

In [58]:
y_pred_proba = rf.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [59]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [60]:
conf_matrix = (confusion_matrix(y_train, y_pred))
conf_df = pd.DataFrame(conf_matrix, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])
rubric_df = pd.DataFrame([['true positive', 'false negative'],['false positive', 'true negative']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])
joined = pd.concat([conf_df, rubric_df], axis=1)
joined

Unnamed: 0,predict_death,predict_survive,predict_death.1,predict_survive.1
actual_death,300,7,true positive,false negative
actual_survive,23,168,false positive,true negative


In [61]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

        died       0.93      0.98      0.95       307
    survived       0.96      0.88      0.92       191

    accuracy                           0.94       498
   macro avg       0.94      0.93      0.94       498
weighted avg       0.94      0.94      0.94       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [62]:
def get_metrics_binary(clf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    #accuracy = clf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    clf = confusion_matrix(y_train, y_pred)
    tpr = clf[0][0] / clf[0].sum()
    fpr = clf[0][1] / clf[1].sum()
    tnr = clf[1][1] / clf[1].sum()
    fnr = clf[1][0] / clf[0].sum()
    print(f'''
    The True Positive Rate is {tpr:.2}, The False Positive Rate is {fpr:.2},
    The True Negative Rate is {tnr:.2}, and the False Negative Rate is {fnr:.2}
    ''')
    return class_report

In [63]:
rf_model1_report = get_metrics_binary(conf_df)
rf_model1_report


    The True Positive Rate is 0.98, The False Positive Rate is 0.037,
    The True Negative Rate is 0.88, and the False Negative Rate is 0.075
    


Unnamed: 0,precision,recall,f1-score,support
died,0.928793,0.977199,0.952381,307.0
survived,0.96,0.879581,0.918033,191.0
accuracy,0.939759,0.939759,0.939759,0.939759
macro avg,0.944396,0.92839,0.935207,498.0
weighted avg,0.940762,0.939759,0.939207,498.0


### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [64]:
#Random Forest Model 2 with increased min_samples_leaf and decreased mx_depth
rf_2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=4, 
                            random_state=123)

In [65]:
#Fit the training data to random forest model 2
rf_2.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, min_samples_leaf=2, random_state=123)

In [66]:
#Set the y_pred
y_pred = rf_2.predict(X_train)

#### Evaluate Model 2

In [67]:
#Return accuracy metric for random forest model 2
print('Model 2 Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_2.score(X_train, y_train)))

Model 2 Accuracy of random forest classifier on training set: 0.84


In [68]:
#Set the confusion matrix for model 2...treat death as the positive class
conf_matrix = (confusion_matrix(y_train, y_pred))
conf_df = pd.DataFrame(conf_matrix, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [69]:
#Return metrics report for evaluating model 2 
rf_model2_report = get_metrics_binary(conf_df)
rf_model2_report


    The True Positive Rate is 0.94, The False Positive Rate is 0.099,
    The True Negative Rate is 0.68, and the False Negative Rate is 0.2
    


Unnamed: 0,precision,recall,f1-score,support
died,0.822857,0.938111,0.876712,307.0
survived,0.871622,0.675393,0.761062,191.0
accuracy,0.837349,0.837349,0.837349,0.837349
macro avg,0.847239,0.806752,0.818887,498.0
weighted avg,0.84156,0.837349,0.832356,498.0


### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [70]:
print(f'Model 1 Random Forest Metrics: \n\n {rf_model1_report:}')
print('--------------------------------------')
print(f'Model 2 Random Forest Metrics: \n\n {rf_model2_report:}')

Model 1 Random Forest Metrics: 

               precision    recall  f1-score     support
died           0.928793  0.977199  0.952381  307.000000
survived       0.960000  0.879581  0.918033  191.000000
accuracy       0.939759  0.939759  0.939759    0.939759
macro avg      0.944396  0.928390  0.935207  498.000000
weighted avg   0.940762  0.939759  0.939207  498.000000
--------------------------------------
Model 2 Random Forest Metrics: 

               precision    recall  f1-score     support
died           0.822857  0.938111  0.876712  307.000000
survived       0.871622  0.675393  0.761062  191.000000
accuracy       0.837349  0.837349  0.837349    0.837349
macro avg      0.847239  0.806752  0.818887  498.000000
weighted avg   0.841560  0.837349  0.832356  498.000000


### 6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [71]:
#Random Forest Model 3 with increased min_samples_leaf and decreased mx_depth
rf_3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=10,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=123)

In [72]:
rf_3.fit(X_train, y_train)
y_pred = rf_3.predict(X_train)

In [73]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_3.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.83


In [74]:
conf_matrix = (confusion_matrix(y_train, y_pred))
conf_df = pd.DataFrame(conf_matrix, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [75]:
rf_model3_report = get_metrics_binary(conf_matrix)
rf_model3_report


    The True Positive Rate is 0.95, The False Positive Rate is 0.084,
    The True Negative Rate is 0.63, and the False Negative Rate is 0.23
    


Unnamed: 0,precision,recall,f1-score,support
died,0.806094,0.947883,0.871257,307.0
survived,0.883212,0.633508,0.737805,191.0
accuracy,0.827309,0.827309,0.827309,0.827309
macro avg,0.844653,0.790695,0.804531,498.0
weighted avg,0.835671,0.827309,0.820074,498.0


### ***Random Forest Model 1 performs the best on the training data set***

#### Evaluate the models with out of sample data set

In [76]:
y_pred = rf.predict(X_validate)
print('Model 1 Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

print('------------')
y_pred = rf_2.predict(X_validate)
print('Model 2 Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf_2.score(X_validate, y_validate)))

print('------------')
y_pred = rf_3.predict(X_validate)
print('Model 3 Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf_3.score(X_validate, y_validate)))

Model 1 Accuracy of random forest classifier on validate set: 0.78
------------
Model 2 Accuracy of random forest classifier on validate set: 0.79
------------
Model 3 Accuracy of random forest classifier on validate set: 0.79


### ***Random Forest Model 1 performs better on the validate data set***