In [1]:
import acquire
import prepare
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings

import pandas as pd
warnings.filterwarnings("ignore")

In [2]:
titanic_df = acquire.get_titanic()

In [3]:
prepped_titanic = prepare.prep_titanic(titanic_df)

In [4]:
train, validate, test= prepare.split_titanic_data(prepped_titanic)

In [5]:
train, validate, test =prepare.impute_titanic_mode(train, validate, test)

In [6]:
train.columns

Index(['survived', 'age', 'sibsp', 'fare', 'alone', 'sex_female', 'sex_male',
       'class_First', 'class_Second', 'class_Third', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')

In [7]:
# drop object columns and create X_train of features only 
# and y_train of survived only. 
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

# check the shape
X_train.shape, X_validate.shape, X_test.shape

((498, 12), (214, 12), (179, 12))

In [8]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [32]:
# compute accuracy of baseline
cm = confusion_matrix(y_train, y_train_pred);cm

array([[275,  32],
       [ 95,  96]])

In [34]:
tn, fp, fn, tp = cm.ravel(order='C'); tn, fp, fn, tp

(275, 32, 95, 96)

In [35]:
accuracy = (tp+tn)/(tn+fp+fn+tp)

In [36]:
accuracy # percentage of accuracy

0.7449799196787149

In [37]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_train_pred = knn.predict(X_train)

In [38]:
accuracy_train = knn.score(X_train, y_train); accuracy_train

0.7771084337349398

In [39]:
print('Accuracy of KNN classifier on training set: {:.2f}' # Accuracy train percentage
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.78


In [40]:
cm = confusion_matrix(y_train, y_train_pred); cm # Confusion matrix
pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], # DataFrame of KNN confusion matrix
            columns=['Pred 0', 'Pred 1'])

Unnamed: 0,Pred 0,Pred 1
Actual 0,267,40
Actual 1,71,120


In [41]:
pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.789941,0.75,0.777108,0.76997,0.774622
recall,0.869707,0.628272,0.777108,0.74899,0.777108
f1-score,0.827907,0.683761,0.777108,0.755834,0.772622
support,307.0,191.0,0.777108,498.0,498.0


##### TP FP TN FN Precision Recall F1-Score and Support

In [42]:
accuracy = (tp+tn)/(tn+fp+fn+tp); print(f'Accuracy: {accuracy}')
true_positive_rate = tp/(tp+fn); print(f'Accuracy: {true_positive_rate}')
false_positive_rate = fp/(fp + tn); print(f'False Positive Rate: {false_positive_rate}')
true_negative_rate = tn/(tn + fp); print(f'True Negative Rate: {true_negative_rate}')
false_negative_rate = fn/(fn + tp); print(f'False Negative Rate: {false_negative_rate}')
precision = tp/(tp + fp); print(f"Precision: {precision}")
recall = tp/(tp + fn); print(f"Recall: {recall}")
f1_score = 2*(precision*recall)/(precision+recall); print(f"F1 Score: {f1_score}")
support_pos = tp + fn; print(f"Support (0): {support_pos}")
support_neg = fp + tn; print(f"Support (1): {support_neg}")
    

Accuracy: 0.7449799196787149
Accuracy: 0.5026178010471204
False Positive Rate: 0.10423452768729642
True Negative Rate: 0.8957654723127035
False Negative Rate: 0.4973821989528796
Precision: 0.75
Recall: 0.5026178010471204
F1 Score: 0.6018808777429466
Support (0): 191
Support (1): 307


In [11]:
knn = KNeighborsClassifier(n_neighbors=10)

In [12]:
knn.fit(X_train, y_train)

In [13]:
y_train_pred = knn.predict(X_train)

In [14]:
y_validate_pred = knn.predict(X_validate)

In [15]:
knn, y_train, y_validate_pred

(KNeighborsClassifier(n_neighbors=10),
 707    1
 515    0
 455    1
 876    0
 754    1
       ..
 149    0
 321    0
 439    0
 267    1
 152    0
 Name: survived, Length: 498, dtype: int64,
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [20]:
accuracy = knn.score(X_train, y_train); accuracy

0.7449799196787149

In [24]:
cm = confusion_matrix(y_train, y_train_pred); cm

array([[275,  32],
       [ 95,  96]])

In [26]:
pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], 
                       columns=['Pred 0', 'Pred 1'])

Unnamed: 0,Pred 0,Pred 1
Actual 0,275,32
Actual 1,95,96


In [27]:
pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.743243,0.75,0.74498,0.746622,0.745835
recall,0.895765,0.502618,0.74498,0.699192,0.74498
f1-score,0.812408,0.601881,0.74498,0.707144,0.731663
support,307.0,191.0,0.74498,498.0,498.0


In [30]:
y_train_pred, y_validate_pred

(array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [43]:
knn.fit(X_validate, y_validate)

In [49]:
y_train_pred = knn.predict(X_train)

In [None]:
y_validate_pred = knn.predict(X_validate)

In [47]:
accuracy = knn.score(X_validate, y_validate); accuracy

0.780373831775701

In [51]:
cm = confusion_matrix(y_validate,y_validate_pred);cm

array([[115,  17],
       [ 30,  52]])

In [52]:
pd.DataFrame(classification_report(y_validate, y_validate_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.793103,0.753623,0.780374,0.773363,0.777975
recall,0.871212,0.634146,0.780374,0.752679,0.780374
f1-score,0.830325,0.688742,0.780374,0.759533,0.776073
support,132.0,82.0,0.780374,214.0,214.0


In [53]:
tn, fp, fn, tp = cm.ravel(order='C'); tn, fp, fn, tp

(115, 17, 30, 52)

In [54]:
accuracy = (tp+tn)/(tn+fp+fn+tp); print(f'Accuracy: {accuracy}')
true_positive_rate = tp/(tp+fn); print(f'Accuracy: {true_positive_rate}')
false_positive_rate = fp/(fp + tn); print(f'False Positive Rate: {false_positive_rate}')
true_negative_rate = tn/(tn + fp); print(f'True Negative Rate: {true_negative_rate}')
false_negative_rate = fn/(fn + tp); print(f'False Negative Rate: {false_negative_rate}')
precision = tp/(tp + fp); print(f"Precision: {precision}")
recall = tp/(tp + fn); print(f"Recall: {recall}")
f1_score = 2*(precision*recall)/(precision+recall); print(f"F1 Score: {f1_score}")
support_pos = tp + fn; print(f"Support (0): {support_pos}")
support_neg = fp + tn; print(f"Support (1): {support_neg}")
    

Accuracy: 0.780373831775701
Accuracy: 0.6341463414634146
False Positive Rate: 0.12878787878787878
True Negative Rate: 0.8712121212121212
False Negative Rate: 0.36585365853658536
Precision: 0.7536231884057971
Recall: 0.6341463414634146
F1 Score: 0.6887417218543047
Support (0): 82
Support (1): 132
