Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

Evaluate your results using the model score, confusion matrix, and classification report.

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Run through steps 2-4 setting k to 10

Run through setps 2-4 setting k to 20

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Which model performs best on our out-of-sample data from validate?

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from pydataset import data

# read titanic data from acquire file data 
import acquire
import env


df  = acquire.get_titanic_data()
train, validate, test = acquire.prep_titanic_data(df)
train.sample(10)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
191,0,2,19.0,0,0,13.0,1,1,0,1
565,0,3,24.0,2,0,24.15,0,1,0,1
659,0,1,58.0,0,2,113.275,0,1,0,0
492,0,1,55.0,0,0,30.5,1,1,0,1
115,0,3,21.0,0,0,7.925,1,1,0,1
479,1,3,2.0,0,1,12.2875,0,0,0,1
192,1,3,19.0,1,0,7.8542,0,0,0,1
460,1,1,48.0,0,0,26.55,1,1,0,1
875,1,3,15.0,0,0,7.225,1,0,0,0
218,1,1,32.0,0,0,76.2917,1,0,0,0


In [2]:
train.shape, validate.shape, test.shape

((498, 10), (214, 10), (179, 10))

In [5]:
x_cols = ['pclass', 'age', 'sex_male', 'fare']
y_col = "survived"


In [12]:
X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [9]:
x_train.head()

Unnamed: 0,pclass,age,sex_male,fare
301,3,30.189296,1,23.25
290,1,26.0,0,78.85
779,1,43.0,0,211.3375
356,1,22.0,0,55.0
147,3,9.0,0,34.375


In [14]:
knn1 = KNeighborsClassifier(1)
knn1.fit(X_train, y_train)  
# CLASSIFICATION METRICS
y_pred = knn1.predict(X_train)


In [15]:
# compare predictor and train set
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       307
           1       0.96      0.99      0.98       191

    accuracy                           0.98       498
   macro avg       0.98      0.99      0.98       498
weighted avg       0.98      0.98      0.98       498



In [16]:
report = classification_report(y_train, y_pred, output_dict=True)
print("n_neighbour = 1")
pd.DataFrame(report)


n_neighbour = 1


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.996678,0.964467,0.983936,0.980572,0.984324
recall,0.977199,0.994764,0.983936,0.985982,0.983936
f1-score,0.986842,0.979381,0.983936,0.983112,0.983981
support,307.0,191.0,0.983936,498.0,498.0


In [17]:
confusion_matrix(y_train, y_pred)


array([[300,   7],
       [  1, 190]])

In [18]:
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)


Actual on Left, Predicted on Top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,300,7
1,1,190


In [19]:
# Lets be nicer and say that the "positive" outcome is survival = 1
TN, FP, FN, TP = confusion_matrix(y_train,y_pred).ravel()
ALL = TP + TN + FP + FN

TN, FP, FN, TP

(300, 7, 1, 190)

In [21]:
def show_scores(TN, FP, FN, TP):
    
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL # How often did the model get it right?
    precision = TP/(TP+FP) # What is the quality of a positive prediction made by the model?
    recall = TP/(TP+FN) # How many of the true positives were found?   
    
    true_positive_rate = TP/(TP+FN) # Same as recall, actually
    true_negative_rate = TN/(TN+FP) # How many of the true negatives were found?
    false_positive_rate = FP/(FP+TN) # How often did we miss the negative and accidentally call it positive?
    false_negative_rate = FN/(FN+TP) # How often did we miss the positive and accidentally call it negative?
    
    f1_score = 2*(precision*recall)/(precision+recall) # Harmonic mean, good for imbalanced data sets
    support_pos = TP + FN # Number of actual positives in the sample
    support_neg = FP + TN # Number of actual negatives in the sample
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"True Positive Rate: {true_positive_rate}")
    print(f"True Negative Rate: {true_negative_rate}")
    print(f"False Positive Rate: {false_positive_rate}")
    print(f"False Negative Rate: {false_negative_rate}")
    print(f"F1 Score: {f1_score}")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [23]:
show_scores(TN,  FP, FN, TP)

Accuracy: 0.9839357429718876
Precision: 0.9644670050761421
Recall: 0.9947643979057592
True Positive Rate: 0.9947643979057592
True Negative Rate: 0.9771986970684039
False Positive Rate: 0.02280130293159609
False Negative Rate: 0.005235602094240838
F1 Score: 0.9793814432989691
Support (0): 191
Support (1): 307


for 10 neighbors

In [25]:
knn10 = KNeighborsClassifier(10)
knn10.fit(X_train, y_train)  
# CLASSIFICATION METRICS
y_pred = knn10.predict(X_train)

report = classification_report(y_train, y_pred, output_dict=True)
print("n_neighbour = 10")
pd.DataFrame(report)

n_neighbour = 10


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.736986,0.714286,0.730924,0.725636,0.72828
recall,0.876221,0.497382,0.730924,0.686802,0.730924
f1-score,0.800595,0.58642,0.730924,0.693507,0.718452
support,307.0,191.0,0.730924,498.0,498.0


In [26]:

# confusion_matrix
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

Actual on Left, Predicted on Top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,269,38
1,96,95


knn classifier with n_neighbor = 20


In [28]:

knn20 = KNeighborsClassifier(20)
knn20.fit(X_train, y_train)
y_pred = knn20.predict(X_train)

report = classification_report(y_train, y_pred, output_dict=True)
print("n_neighbour = 20")
pd.DataFrame(report)

n_neighbour = 20


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.713528,0.68595,0.706827,0.699739,0.702951
recall,0.876221,0.434555,0.706827,0.655388,0.706827
f1-score,0.78655,0.532051,0.706827,0.6593,0.688941
support,307.0,191.0,0.706827,498.0,498.0


In [29]:
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

Actual on Left, Predicted on Top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,269,38
1,108,83


What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
# it seems like when there is only one neighbour the accuracy is at its highest because the model only fitted to one neighbor