### Use the titanic data

In [1]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

import acquire
import prepare

### Acquire the data

In [2]:
#Load the titanic data through the acquire module
titanic = acquire.get_titanic_data()
#Preview the first rows to confirm the load
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### Prepare the data

In [3]:
#Run the acquired frame through prepare.prep_titanic and rebind `titanic` to the result.
#NOTE(review): because `titanic` is overwritten, re-running this cell on its own feeds
#already-prepared data back into prep_titanic - re-run the acquire cell first.
titanic = prepare.prep_titanic(titanic)
#Preview the prepared columns
titanic.head()

Unnamed: 0,survived,pclass,num_sib_and_sp,num_par_and_ch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [4]:
#Split the data into train, validate, and test subsets
#NOTE(review): 'survived' is passed as the second argument - presumably the column to
#stratify on; confirm against prepare.train_validate_test_split
train, validate, test = prepare.train_validate_test_split(titanic, 'survived')

In [5]:
#Pull the target column (survived) out of each subset; every other column is a feature
X_train, y_train = train.drop(columns = 'survived'), train['survived']
X_validate, y_validate = validate.drop(columns = 'survived'), validate['survived']
X_test, y_test = test.drop(columns = 'survived'), test['survived']

## Decision Tree

### What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [6]:
#The target is survived; its most frequent value in train is the baseline prediction.
train['survived'].value_counts()

0    307
1    191
Name: survived, dtype: int64

In [7]:
#Since most people did not survive, this will be the baseline prediction.
#Use the dummy classifier to set the baseline.
#strategy = 'most_frequent' always predicts the most common class in y_train, so the
#baseline keeps following the mode if the split changes (no hard-coded class label).
#DummyClassifier is imported with the other sklearn imports at the top of the notebook.
baseline = DummyClassifier(strategy = 'most_frequent')
baseline.fit(X_train, y_train)

#Now get the baseline accuracy (scored on the validate subset)
baseline.score(X_validate, y_validate)

0.616822429906542

### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [8]:
#Build a depth-5 decision tree and fit it to the training data in one step
#(fit returns the fitted estimator itself)
model1 = DecisionTreeClassifier(random_state = 123, max_depth = 5).fit(X_train, y_train)

#In-sample predictions
model1_preds = model1.predict(X_train)

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [9]:
#In-sample accuracy of model1
model1.score(X = X_train, y = y_train)

0.8373493975903614

In [10]:
#Confusion Matrix (index = actual, columns = predictions)
#Predict inline instead of reading model1_preds: that name is rebound to the validate
#predictions further down, so re-running this cell afterwards would fail on a
#sample-count mismatch with y_train.
pd.DataFrame(confusion_matrix(y_train, model1.predict(X_train)))

Unnamed: 0,0,1
0,303,4
1,77,114


In [11]:
#Classification Report
#Predict inline instead of reading model1_preds: that name is rebound to the validate
#predictions further down, so re-running this cell afterwards would fail on a
#sample-count mismatch with y_train.
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
#Precision, recall, f1-score, and support are listed above
#Read the four counts from the confusion matrix instead of copying them by hand;
#for binary labels ravel() returns them in the order tn, fp, fn, tp.
tn_1, fp_1, fn_1, tp_1 = confusion_matrix(y_train, model1.predict(X_train)).ravel().tolist()

#Accuracy and the four rates, derived from the counts
accuracy_1 = (tp_1 + tn_1) / (tp_1 + tn_1 + fp_1 + fn_1)
tpr_1 = tp_1 / (tp_1 + fn_1)   #true positive rate (recall for survivors)
fpr_1 = fp_1 / (fp_1 + tn_1)   #false positive rate
tnr_1 = tn_1 / (tn_1 + fp_1)   #true negative rate
fnr_1 = fn_1 / (fn_1 + tp_1)   #false negative rate

### Run through steps 2-4 using a different max_depth value.

In [13]:
#Build a depth-9 decision tree and fit it to the training data in one step
#(fit returns the fitted estimator itself)
model2 = DecisionTreeClassifier(random_state = 123, max_depth = 9).fit(X_train, y_train)

#In-sample predictions
model2_preds = model2.predict(X_train)

In [14]:
#In-sample accuracy of model2
model2.score(X = X_train, y = y_train)

0.9096385542168675

In [15]:
#Confusion Matrix
#index = actual
#columns = predictions
#Predict inline instead of reading model2_preds: that name is rebound to the validate
#predictions further down, so re-running this cell afterwards would fail on a
#sample-count mismatch with y_train.
pd.DataFrame(confusion_matrix(y_train, model2.predict(X_train)))

Unnamed: 0,0,1
0,298,9
1,36,155


In [16]:
#Classification Report
#Predict inline instead of reading model2_preds: that name is rebound to the validate
#predictions further down, so re-running this cell afterwards would fail on a
#sample-count mismatch with y_train.
print(classification_report(y_train, model2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       307
           1       0.95      0.81      0.87       191

    accuracy                           0.91       498
   macro avg       0.92      0.89      0.90       498
weighted avg       0.91      0.91      0.91       498



In [17]:
#Calculate tp, tn, fp, fn
#Read the four counts from the confusion matrix instead of copying them by hand;
#for binary labels ravel() returns them in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_train, model2.predict(X_train)).ravel().tolist()

#Accuracy and the four rates, derived from the counts
accuracy_2 = (tp + tn) / (tp + tn + fp + fn)
tpr_2 = tp / (tp + fn)   #true positive rate (recall for survivors)
fpr_2 = fp / (fp + tn)   #false positive rate
tnr_2 = tn / (tn + fp)   #true negative rate
fnr_2 = fn / (fn + tp)   #false negative rate

### Which model performs better on your in-sample data?

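model2 (max_depth = 9) performs better on the in-sample data: its training accuracy is about 0.91, compared with about 0.84 for model1 (max_depth = 5).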

### Which model performs best on your out-of-sample data, the validate set?

In [18]:
#Compare overall accuracy on the validate subset

#model1 (max_depth = 5)
model1.score(X = X_validate, y = y_validate)

0.7570093457943925

In [19]:
#model2 (max_depth = 9)
model2.score(X = X_validate, y = y_validate)

0.7616822429906542

In [20]:
#Classification reports on the validate subset

#model1: from here on model1_preds holds the validate predictions (it is rebound below)
model1_preds = model1.predict(X = X_validate)
print(classification_report(y_true = y_validate, y_pred = model1_preds))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       132
           1       0.83      0.46      0.59        82

    accuracy                           0.76       214
   macro avg       0.78      0.70      0.71       214
weighted avg       0.77      0.76      0.74       214



In [21]:
#model2: from here on model2_preds holds the validate predictions (it is rebound below)
model2_preds = model2.predict(X = X_validate)
print(classification_report(y_true = y_validate, y_pred = model2_preds))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       132
           1       0.72      0.61      0.66        82

    accuracy                           0.76       214
   macro avg       0.75      0.73      0.74       214
weighted avg       0.76      0.76      0.76       214



In [22]:
#Check confusion matrices (index = actual, columns = predictions)

#For model1
#Predict on validate inline: model1_preds holds train or validate predictions depending
#on which cell last assigned it, so reading it here is order-dependent.
pd.DataFrame(confusion_matrix(y_validate, model1.predict(X_validate)))

Unnamed: 0,0,1
0,124,8
1,44,38


In [23]:
#For model2
#Predict on validate inline: model2_preds holds train or validate predictions depending
#on which cell last assigned it, so reading it here is order-dependent.
pd.DataFrame(confusion_matrix(y_validate, model2.predict(X_validate)))

Unnamed: 0,0,1
0,113,19
1,32,50


While accuracy is about the same for both models, their precision and recall for survivor predictions were quite different: model1 is more precise on survivors (0.83 vs. 0.72), but model2 has much higher recall (0.61 vs. 0.46). Based on these numbers, and assuming it's more important to identify who survived than who did not, model2 seems to be the better performer.

## Random Forest

### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [24]:
#RandomForestClassifier is imported with the other sklearn imports at the top of the notebook.

#Instantiate the model
clf_1 = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10, random_state = 123)

#Fit the model to training data
clf_1.fit(X_train, y_train)

#Make predictions on the training data
clf_1_preds = clf_1.predict(X_train)

### Evaluate your results using the model score, confusion matrix, and classification report.

In [25]:
#In-sample accuracy of clf_1, kept for the summary printout below
accuracy = clf_1.score(X = X_train, y = y_train)

In [26]:
#Confusion matrix for clf_1 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = clf_1_preds))

Unnamed: 0,0,1
0,301,6
1,22,169


In [27]:
#Per-class precision, recall, f1-score and support for clf_1 on train
print(classification_report(y_true = y_train, y_pred = clf_1_preds))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       307
           1       0.97      0.88      0.92       191

    accuracy                           0.94       498
   macro avg       0.95      0.93      0.94       498
weighted avg       0.94      0.94      0.94       498



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [28]:
#Precision, recall, f1-score and support are all clearly labeled above.

#Compute every figure from clf_1 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which later cells overwrite with other models' scores.
clf_1_cm = confusion_matrix(y_train, clf_1_preds)   #index = actual, columns = predictions

print(f'Accuracy: {clf_1.score(X_train, y_train)}')
print(f'True Positives: {clf_1_cm[1][1]}')
print(f'False Positives: {clf_1_cm[0][1]}')
print(f'True Negatives: {clf_1_cm[0][0]}')
print(f'False Negatives: {clf_1_cm[1][0]}')

Accuracy: 0.9437751004016064
True Positives: 169
False Positives: 6
True Negatives: 301
False Negatives: 22


### Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [29]:
#Second forest: larger leaves (min_samples_leaf = 3) and shallower trees (max_depth = 7),
#fitted to the training data in one step (fit returns the fitted estimator itself)
clf_2 = RandomForestClassifier(min_samples_leaf = 3, max_depth = 7, random_state = 123).fit(X_train, y_train)

#In-sample predictions
clf_2_preds = clf_2.predict(X_train)

In [30]:
#In-sample accuracy of clf_2, kept for the summary printout below
accuracy = clf_2.score(X = X_train, y = y_train)

In [31]:
#Confusion matrix for clf_2 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = clf_2_preds))

Unnamed: 0,0,1
0,294,13
1,48,143


In [32]:
#Per-class precision, recall, f1-score and support for clf_2 on train
print(classification_report(y_true = y_train, y_pred = clf_2_preds))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       307
           1       0.92      0.75      0.82       191

    accuracy                           0.88       498
   macro avg       0.89      0.85      0.87       498
weighted avg       0.88      0.88      0.87       498



In [33]:
#Compute every figure from clf_2 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which other cells overwrite with other models' scores.
clf_2_cm = confusion_matrix(y_train, clf_2_preds)   #index = actual, columns = predictions

print(f'Accuracy: {clf_2.score(X_train, y_train)}')
print(f'True Positives: {clf_2_cm[1][1]}')
print(f'False Positives: {clf_2_cm[0][1]}')
print(f'True Negatives: {clf_2_cm[0][0]}')
print(f'False Negatives: {clf_2_cm[1][0]}')

Accuracy: 0.8775100401606426
True Positives: 143
False Positives: 13
True Negatives: 294
False Negatives: 48


In [34]:
#Third forest: min_samples_leaf = 5 and max_depth = 4,
#fitted to the training data in one step (fit returns the fitted estimator itself)
clf_3 = RandomForestClassifier(min_samples_leaf = 5, max_depth = 4, random_state = 123).fit(X_train, y_train)

#In-sample predictions
clf_3_preds = clf_3.predict(X_train)

In [35]:
#In-sample accuracy of clf_3, kept for the summary printout below
accuracy = clf_3.score(X = X_train, y = y_train)

In [36]:
#Confusion matrix for clf_3 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = clf_3_preds))

Unnamed: 0,0,1
0,287,20
1,66,125


In [37]:
#Per-class precision, recall, f1-score and support for clf_3 on train
print(classification_report(y_true = y_train, y_pred = clf_3_preds))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87       307
           1       0.86      0.65      0.74       191

    accuracy                           0.83       498
   macro avg       0.84      0.79      0.81       498
weighted avg       0.83      0.83      0.82       498



In [38]:
#Compute every figure from clf_3 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which other cells overwrite with other models' scores.
clf_3_cm = confusion_matrix(y_train, clf_3_preds)   #index = actual, columns = predictions

print(f'Accuracy: {clf_3.score(X_train, y_train)}')
print(f'True Positives: {clf_3_cm[1][1]}')
print(f'False Positives: {clf_3_cm[0][1]}')
print(f'True Negatives: {clf_3_cm[0][0]}')
print(f'False Negatives: {clf_3_cm[1][0]}')

Accuracy: 0.8273092369477911
True Positives: 125
False Positives: 20
True Negatives: 287
False Negatives: 66


### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

With every increase in min_samples_leaf and decrease in max_depth, the models perform worse on the training data. True positives and negatives go down, and false positives and negatives go up. My first model with min_samples_leaf = 1 and max_depth = 10 performed the best on the training data. I think this is because the higher max_depth (together with the smaller min_samples_leaf) allows the model to fit the training data more closely, i.e. to overfit.

### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [39]:
#Testing each model on Train data set
models = [clf_1, clf_2, clf_3]

for model in models:
    preds = model.predict(X_train)
    #Build the confusion matrix once per model instead of once per printed figure
    #(index = actual, columns = predictions)
    train_cm = confusion_matrix(y_train, preds)
    print(f'{model}')
    print(f'Accuracy: {model.score(X_train, y_train)}')
    print(f'True Positives: {train_cm[1][1]}')
    print(f'False Positives: {train_cm[0][1]}')
    print(f'True Negatives: {train_cm[0][0]}')
    print(f'False Negatvies: {train_cm[1][0]}')
    print(classification_report(y_train, preds))
    print('\n')

RandomForestClassifier(max_depth=10, random_state=123)
Accuracy: 0.9437751004016064
True Positives: 169
False Positives: 6
True Negatives: 301
False Negatvies: 22
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       307
           1       0.97      0.88      0.92       191

    accuracy                           0.94       498
   macro avg       0.95      0.93      0.94       498
weighted avg       0.94      0.94      0.94       498



RandomForestClassifier(max_depth=7, min_samples_leaf=3, random_state=123)
Accuracy: 0.8775100401606426
True Positives: 143
False Positives: 13
True Negatives: 294
False Negatvies: 48
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       307
           1       0.92      0.75      0.82       191

    accuracy                           0.88       498
   macro avg       0.89      0.85      0.87       498
weighted avg       0.88      0.88      0.87       498

In [40]:
#Testing each model on validate data set

#List the forests explicitly: the shared `models` name is rebound to the KNN models
#further down, so relying on it would make this cell evaluate the wrong models on re-run.
for model in [clf_1, clf_2, clf_3]:
    preds = model.predict(X_validate)
    #Build the confusion matrix once per model instead of once per printed figure
    #(index = actual, columns = predictions)
    validate_cm = confusion_matrix(y_validate, preds)
    print(f'{model}')
    print(f'Accuracy: {model.score(X_validate, y_validate)}')
    print(f'True Positives: {validate_cm[1][1]}')
    print(f'False Positives: {validate_cm[0][1]}')
    print(f'True Negatives: {validate_cm[0][0]}')
    print(f'False Negatvies: {validate_cm[1][0]}')
    print(classification_report(y_validate, preds))
    print('\n')

RandomForestClassifier(max_depth=10, random_state=123)
Accuracy: 0.7850467289719626
True Positives: 51
False Positives: 15
True Negatives: 117
False Negatvies: 31
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       132
           1       0.77      0.62      0.69        82

    accuracy                           0.79       214
   macro avg       0.78      0.75      0.76       214
weighted avg       0.78      0.79      0.78       214



RandomForestClassifier(max_depth=7, min_samples_leaf=3, random_state=123)
Accuracy: 0.8130841121495327
True Positives: 53
False Positives: 11
True Negatives: 121
False Negatvies: 29
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       132
           1       0.83      0.65      0.73        82

    accuracy                           0.81       214
   macro avg       0.82      0.78      0.79       214
weighted avg       0.81      0.81      0.81       214


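After comparing each model's performance on the validate set to its performance on the training set, it seems that the third random forest (clf_3: min_samples_leaf = 5, max_depth = 4) has the most consistent performance from set to set. However, the second random forest (clf_2: min_samples_leaf = 3, max_depth = 7) actually performed the best on the validate data set, with an accuracy of about 0.81.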

## KNN

### Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [42]:
#KNeighborsClassifier is imported with the other sklearn imports at the top of the notebook.

#Instantiate the model
knn_1 = KNeighborsClassifier(n_neighbors = 1, weights = 'uniform')

#Fit the model to training data
knn_1.fit(X_train, y_train)

#Make predictions on training data
knn_1_preds = knn_1.predict(X_train)

### Evaluate your results using the model score, confusion matrix, and classification report.

In [44]:
#In-sample accuracy of knn_1, kept for the summary printout below
accuracy = knn_1.score(X = X_train, y = y_train)

In [45]:
#Confusion matrix for knn_1 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = knn_1_preds))

Unnamed: 0,0,1
0,297,10
1,21,170


In [46]:
#Per-class precision, recall, f1-score and support for knn_1 on train
print(classification_report(y_true = y_train, y_pred = knn_1_preds))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       307
           1       0.94      0.89      0.92       191

    accuracy                           0.94       498
   macro avg       0.94      0.93      0.93       498
weighted avg       0.94      0.94      0.94       498



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [47]:
#Precision, Recall, f1-score, and support are all clearly labeled above.
#Compute every figure from knn_1 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which other cells overwrite with other models' scores.
knn_1_cm = confusion_matrix(y_train, knn_1_preds)   #index = actual, columns = predictions

print(f'Accuracy: {knn_1.score(X_train, y_train)}')
print(f'True Positives: {knn_1_cm[1][1]}')
print(f'True Negatives: {knn_1_cm[0][0]}')
print(f'False Positives: {knn_1_cm[0][1]}')
print(f'False Negatives: {knn_1_cm[1][0]}')

Accuracy: 0.9377510040160643
True Positives: 170
True Negatives: 297
False Positives: 10
False Negatives: 21


### Run through steps 2-4 setting k to 10

In [48]:
#KNN with k = 10 and uniform weights, fitted to the training data in one step
#(fit returns the fitted estimator itself)
knn_2 = KNeighborsClassifier(weights = 'uniform', n_neighbors = 10).fit(X_train, y_train)

#In-sample predictions
knn_2_preds = knn_2.predict(X_train)

In [49]:
#In-sample accuracy of knn_2, kept for the summary printout below
accuracy = knn_2.score(X = X_train, y = y_train)

In [50]:
#Confusion matrix for knn_2 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = knn_2_preds))

Unnamed: 0,0,1
0,267,40
1,68,123


In [51]:
#Per-class precision, recall, f1-score and support for knn_2 on train
print(classification_report(y_true = y_train, y_pred = knn_2_preds))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       307
           1       0.75      0.64      0.69       191

    accuracy                           0.78       498
   macro avg       0.78      0.76      0.76       498
weighted avg       0.78      0.78      0.78       498



In [53]:
#Compute every figure from knn_2 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which other cells overwrite with other models' scores.
knn_2_cm = confusion_matrix(y_train, knn_2_preds)   #index = actual, columns = predictions

print(f'Accuracy: {knn_2.score(X_train, y_train)}')
print(f'True Positives: {knn_2_cm[1][1]}')
print(f'True Negatives: {knn_2_cm[0][0]}')
print(f'False Positives: {knn_2_cm[0][1]}')
print(f'False Negatives: {knn_2_cm[1][0]}')

Accuracy: 0.7831325301204819
True Positives: 123
True Negatives: 267
False Positives: 40
False Negatives: 68


### Run through steps 2-4 setting k to 20

In [54]:
#KNN with k = 20 and uniform weights, fitted to the training data in one step
#(fit returns the fitted estimator itself)
knn_3 = KNeighborsClassifier(weights = 'uniform', n_neighbors = 20).fit(X_train, y_train)

#In-sample predictions
knn_3_preds = knn_3.predict(X_train)

In [55]:
#In-sample accuracy of knn_3, kept for the summary printout below
accuracy = knn_3.score(X = X_train, y = y_train)

In [56]:
#Confusion matrix for knn_3 on train (index = actual, columns = predictions)
pd.DataFrame(data = confusion_matrix(y_true = y_train, y_pred = knn_3_preds))

Unnamed: 0,0,1
0,263,44
1,87,104


In [63]:
#Classification report for knn_3 on train, returned as a dict (output_dict = True)
#and displayed as a DataFrame instead of printed text
pd.DataFrame(data = classification_report(y_true = y_train, y_pred = knn_3_preds, output_dict = True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.751429,0.702703,0.736948,0.727066,0.732741
recall,0.856678,0.544503,0.736948,0.70059,0.736948
f1-score,0.800609,0.613569,0.736948,0.707089,0.728873
support,307.0,191.0,0.736948,498.0,498.0


In [58]:
#Compute every figure from knn_3 itself rather than printing hand-copied counts and the
#shared `accuracy` variable, which other cells overwrite with other models' scores.
knn_3_cm = confusion_matrix(y_train, knn_3_preds)   #index = actual, columns = predictions

print(f'Accuracy: {knn_3.score(X_train, y_train)}')
print(f'True Positives: {knn_3_cm[1][1]}')
print(f'True Negatives: {knn_3_cm[0][0]}')
print(f'False Positives: {knn_3_cm[0][1]}')
print(f'False Negatives: {knn_3_cm[1][0]}')

Accuracy: 0.7369477911646586
True Positives: 104
True Negatives: 263
False Positives: 44
False Negatives: 87


### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

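As k was increased, the overall accuracy of the model went down (about 0.94 for k = 1, 0.78 for k = 10, and 0.74 for k = 20). The first model with k = 1 performed the best on the in-sample data. With k = 1, each training observation is its own nearest neighbor when we predict on the training set, so the model effectively memorizes the training data (presumably it falls short of 100% only because some passengers share identical feature values but have different outcomes). As k increases, the likelihood that unrelated data points get taken into consideration increases, which lowers the in-sample accuracy.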

### Which model performs best on our out-of-sample data from validate?

In [65]:
#Compare the three KNN models on the validate subset: confusion-matrix counts, validate
#accuracy, and the drop in accuracy from train to validate.
models = [knn_1, knn_2, knn_3]
metrics = []

for model in models:
    #Make predictions
    knn_preds = model.predict(X_validate)

    #Compute each quantity once per model rather than once per dict entry
    knn_cm = confusion_matrix(y_validate, knn_preds)   #index = actual, columns = predictions
    train_accuracy = model.score(X_train, y_train)
    validate_accuracy = model.score(X_validate, y_validate)

    output = {
        'N_Neighbors': model.n_neighbors,
        'True Positves': knn_cm[1][1],
        'False Positives': knn_cm[0][1],
        'True Negatives': knn_cm[0][0],
        'False Negatvies': knn_cm[1][0],
        'Accuracy': validate_accuracy,
        'Difference': train_accuracy - validate_accuracy
    }

    metrics.append(output)

#One row per model
pd.DataFrame(metrics)

Unnamed: 0,N_Neighbors,True Positves,False Positives,True Negatives,False Negatvies,Accuracy,Difference
0,1,49,25,107,33,0.728972,0.208779
1,10,46,25,107,36,0.714953,0.068179
2,20,38,26,106,44,0.672897,0.064051


The first model with k = 1 performed the best on our validate data set, but also had a huge change in accuracy from the training set. For this reason, I think model 2 with k = 10 is the best choice. It offers nearly the same accuracy as the first model on the validate set, but with a much smaller change in accuracy from the training data set.