# K Nearest Neighbors Model with Bayes

Refined kNN classification model with Bayesian input from the very-accurate breed classification

# Imports

In [51]:
## Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
import csv
set_style("whitegrid")

## More specific imports for this notebook
import joblib ## For saving trained models
from sklearn.neighbors import KNeighborsClassifier ## Import the model here
from sklearn.model_selection import train_test_split ## Import train_test_split
from sklearn.metrics import confusion_matrix ## Import confusion_matrix
from sklearn.metrics import accuracy_score

# Initial Settings and Load Data

In [52]:
X_orig = np.load('../../data/processed_data/specgram_db_pad_trunc.npy')

X_specaug1 = np.load('../../data/processed_data/augment_data/spec_pt_ftmask1.npy')
X_tsh1 = np.load('../../data/processed_data/augment_data/spec_pt_timeshift1.npy')
X_randn1 = np.load('../../data/processed_data/augment_data/spec_pt_randnoise1.npy')
X_tstr1 = np.load('../../data/processed_data/augment_data/spec_pt_timestretch1.npy')

X_specaug2 = np.load('../../data/processed_data/augment_data/spec_pt_ftmask2.npy')
X_tsh2 = np.load('../../data/processed_data/augment_data/spec_pt_timeshift2.npy')
X_randn2 = np.load('../../data/processed_data/augment_data/spec_pt_randnoise2.npy')
X_tstr2 = np.load('../../data/processed_data/augment_data/spec_pt_timestretch2.npy')

X_specaug3 = np.load('../../data/processed_data/augment_data/spec_pt_ftmask3.npy')
X_tsh3 = np.load('../../data/processed_data/augment_data/spec_pt_timeshift3.npy')
X_randn3 = np.load('../../data/processed_data/augment_data/spec_pt_randnoise3.npy')
X_tstr3 = np.load('../../data/processed_data/augment_data/spec_pt_timestretch3.npy')

In [99]:
## Choose what to add to the orignal data
x_data = np.concatenate((X_orig,X_specaug1), axis=0)
X = []
for x in x_data:
    X.append(x.flatten())
aug_file = 'specaug1' #Update when we want to save a file!!!

df = pd.read_csv("../../data/processed_data/metadata.csv")

## Choose which category we are dealing with
situation_to_number = {'brushing': 0, 'food': 1, 'isolation': 2}
breed_to_number = {'european_shorthair': 0, 'maine_coon': 1}
sex_to_number = {0:0, 1:1} ## Sex already transfered to number; this is for uniformity of the code
breed_and_situation_to_number = {'brushing_and_european_shorthair': 0, 'brushing_and_maine_coon': 1,
                    'food_and_european_shorthair': 2, 'food_and_maine_coon': 3,
                    'isolation_and_european_shorthair': 4, 'isolation_and_maine_coon': 5} ## Use for Bayesian study
category_to_number = breed_and_situation_to_number

## Create a new column with numerical values based on the situation mapping
model_types = ['situation', 'breed'] ## Predict model_types[0] after filtering by model_types[1]
model_type = model_types[0] + '_given_' + model_types[1]

df['numerical_'+model_types[0]] = df[model_types[0]].map(situation_to_number)
df['numerical_'+model_types[1]] = df[model_types[1]].map(breed_to_number)
df['numerical_'+model_type] = 2*df['numerical_'+model_types[0]] + df['numerical_'+model_types[1]]

y_situation = np.concatenate((df['numerical_'+model_types[0]].values, df['numerical_'+model_types[0]].values), axis=0)
y_breed = np.concatenate((df['numerical_'+model_types[1]].values, df['numerical_'+model_types[1]].values), axis=0)
y_combined = np.concatenate((df['numerical_'+model_type].values, df['numerical_'+model_type].values), axis=0)

y_orig = df['numerical_'+model_type].values
y = np.concatenate((y_orig, y_orig), axis=0)

X_split = [[],[]]
y_split = [[],[]]
for a in range(0,len(X)):
    X_split[ y_combined[a]%2 ].append(X[a])
    y_split[ y_combined[a]%2 ].append(y[a])


print(x_data.shape)
print(y_situation.shape)
print(y_breed.shape)
print(y_combined.shape)
print(y.shape)

(880, 128, 67)
(880,)
(880,)
(880,)
(880,)


# Train Test Splits

In [100]:
## Set up the train test splits we need

#Use these variables to automate saving runs with different filesnames
test_size = 1/5
random_state = 440

## Train test split for modelinlg the combined breed
x_train_br, x_test_br, y_train_br, y_test_br = train_test_split(X.copy(), y_breed,
                                        shuffle = True,
                                        random_state = random_state,
                                        test_size=test_size)

## Train test split for modelinlg the combined situation 
x_train_sit, x_test_sit, y_train_sit, y_test_sit = train_test_split(X.copy(), y_situation,
                                        shuffle = True,
                                        random_state = random_state,
                                        test_size=test_size)

## Train test split for modelinlg the combined breed-situation combination
x_train_com, x_test_com, y_train_com, y_test_com = train_test_split(X.copy(), y_combined,
                                        shuffle = True,
                                        random_state = random_state,
                                        test_size=test_size)

## Train test split to modeling the situation for shorthairs
x_train_sh, x_test_sh, y_train_sh, y_test_sh = train_test_split(X_split[0].copy(), y_split[0],
                                        shuffle = True,
                                        random_state = random_state,
                                        test_size=test_size)

## Train test split for modeling the situation for maine coons
x_train_mc, x_test_mc, y_train_mc, y_test_mc = train_test_split(X_split[1].copy(), y_split[1],
                                        shuffle = True,
                                        random_state = random_state,
                                        test_size=test_size)

# Fit Models

In [111]:
## Use these variables later to automate saving runs with different filesnames

k = 4
k_breed = k #optimal value 4 from other file
k_situation = k #optimal value 5 from other file
k_combined = k #optimized in this file
k_shorthair = k #optimized in this file
k_mainecoon = k #optimized in this file

## Make the model objects
knn_breed = KNeighborsClassifier(k_breed)
knn_situation = KNeighborsClassifier(k_situation)
knn_combined = KNeighborsClassifier(k_combined)
knn_shorthair = KNeighborsClassifier(k_shorthair)
knn_mainecoon = KNeighborsClassifier(k_mainecoon)

## "Fit" the model object
knn_breed.fit(x_train_br, y_train_br)
knn_situation.fit(x_train_sit, y_train_sit)
knn_combined.fit(x_train_com, y_train_com)
knn_shorthair.fit(x_train_sh, y_train_sh)
knn_mainecoon.fit(x_train_mc, y_train_mc)

# Assess Model Performances

In [112]:
## Predict on the training sets
y_test_pred_br = knn_breed.predict(x_test_br)
y_test_pred_sit = knn_situation.predict(x_test_sit)
y_test_pred_com = knn_combined.predict(x_test_com)
y_test_pred_sh = knn_shorthair.predict(x_test_sh)
y_test_pred_mc = knn_mainecoon.predict(x_test_mc)

## Compute confusion matrix for each model
conf_mat_breed = confusion_matrix(y_test_br, y_test_pred_br)
conf_mat_situation = confusion_matrix(y_test_sit, y_test_pred_sit)
conf_mat_combined = confusion_matrix(y_test_com, y_test_pred_com)
conf_mat_shorthair = confusion_matrix(y_test_sh, y_test_pred_sh)
conf_mat_mainecoon = confusion_matrix(y_test_mc, y_test_pred_mc)

## Compute accuracy for the model
acc_breed = accuracy_score(y_test_br, y_test_pred_br)
acc_situation = accuracy_score(y_test_sit, y_test_pred_sit)
acc_combined = accuracy_score(y_test_com, y_test_pred_com)
acc_shorthair = accuracy_score(y_test_sh, y_test_pred_sh)
acc_mainecoon = accuracy_score(y_test_mc, y_test_pred_mc)

print('Breed accuracy:', acc_breed)
print('Situation accuracy:', acc_situation)
print('Combined accuracy:', acc_combined, '( Breed * Situation =', acc_breed*acc_situation,')')
print('(Situation | European Shorthair) accuracy:', acc_shorthair)
print('(Situation | Maine Coon) accuracy:', acc_mainecoon)
print()
print('Breed confusion matrix:')
print(conf_mat_breed)
print()
print('Situation confusion matrix:')
print(conf_mat_situation)
print()
print('Combined confusion matrix:')
print(conf_mat_combined)
print()
print('(Situation | European Shorthair) confusion matrix:')
print(conf_mat_shorthair)
print()
print('(Situation | Maine Coon) confusion matrix:')
print(conf_mat_mainecoon)

Breed accuracy: 0.9829545454545454
Situation accuracy: 0.7215909090909091
Combined accuracy: 0.7329545454545454 ( Breed * Situation = 0.7092910640495868 )
(Situation | European Shorthair) accuracy: 0.7425742574257426
(Situation | Maine Coon) accuracy: 0.7763157894736842

Breed confusion matrix:
[[99  0]
 [ 3 74]]

Situation confusion matrix:
[[35 10  9]
 [ 6 23  4]
 [10 10 69]]

Combined confusion matrix:
[[19  0  5  0  4  0]
 [ 0 18  0  4  0  4]
 [ 3  1 15  1  3  0]
 [ 2  0  0  7  0  1]
 [ 7  0  2  0 39  0]
 [ 0  3  1  6  0 31]]

(Situation | European Shorthair) confusion matrix:
[[19  4 10]
 [ 3 10  2]
 [ 3  4 46]]

(Situation | Maine Coon) confusion matrix:
[[12  4  4]
 [ 3 12  3]
 [ 2  1 35]]


# A couple data points for X_specaug1

Here are a few clips of data from the best dataset, which was X_specaug1:
    
k=1:

Breed accuracy: 1.0

Situation accuracy: 0.9375

Combined accuracy: 0.9375 ( Breed * Situation = 0.9375 )

(Situation | European Shorthair) accuracy: 0.9405940594059405

(Situation | Maine Coon) accuracy: 0.9605263157894737

Breed confusion matrix:
[[99  0]
 [ 0 77]]

Situation confusion matrix:
[[50  1  3]
 [ 0 32  1]
 [ 2  4 83]]

Combined confusion matrix:
[[27  0  0  0  1  0]
 [ 0 23  0  1  0  2]
 [ 0  0 23  0  0  0]
 [ 0  0  0  9  0  1]
 [ 2  0  0  0 46  0]
 [ 0  0  0  4  0 37]]

(Situation | European Shorthair) confusion matrix:
[[29  2  2]
 [ 1 14  0]
 [ 0  1 52]]

(Situation | Maine Coon) confusion matrix:
[[17  1  2]
 [ 0 18  0]
 [ 0  0 38]]
 
---------------------------------------------
 
k=2:
 
Breed accuracy: 0.9829545454545454

Situation accuracy: 0.8125

Combined accuracy: 0.8068181818181818 ( Breed * Situation = 0.7986505681818181 )

(Situation | European Shorthair) accuracy: 0.8316831683168316

(Situation | Maine Coon) accuracy: 0.881578947368421

Breed confusion matrix:
[[99  0]
 [ 3 74]]

Situation confusion matrix:
[[52  0  2]
 [11 22  0]
 [11  9 69]]

Combined confusion matrix:
[[28  0  0  0  0  0]
 [ 0 24  0  0  0  2]
 [ 9  1 13  0  0  0]
 [ 1  0  1  8  0  0]
 [ 9  0  3  0 36  0]
 [ 0  2  1  5  0 33]]

(Situation | European Shorthair) confusion matrix:
[[30  2  1]
 [ 6  9  0]
 [ 4  4 45]]

(Situation | Maine Coon) confusion matrix:
[[18  0  2]
 [ 3 15  0]
 [ 0  4 34]]
 
-------------------------------------------
 
k=3:
 
Breed accuracy: 0.9772727272727273
 
Situation accuracy: 0.8011363636363636

Combined accuracy: 0.7897727272727273 ( Breed * Situation = 0.7829287190082644 )

(Situation | European Shorthair) accuracy: 0.7920792079207921

(Situation | Maine Coon) accuracy: 0.8157894736842105

Breed confusion matrix:
[[98  1]
 [ 3 74]]

Situation confusion matrix:
[[38  8  8]
 [ 5 24  4]
 [ 5  5 79]]

Combined confusion matrix:
[[22  0  2  1  3  0]
 [ 0 18  0  4  0  4]
 [ 4  0 17  0  2  0]
 [ 2  0  0  6  0  2]
 [ 7  0  1  0 40  0]
 [ 0  0  1  4  0 36]]

(Situation | European Shorthair) confusion matrix:
[[22  3  8]
 [ 3 10  2]
 [ 3  2 48]]

(Situation | Maine Coon) confusion matrix:
[[15  3  2]
 [ 2 12  4]
 [ 2  1 35]]
 
------------------------------------------------------------
 
k=4:
 
Breed accuracy: 0.9829545454545454

Situation accuracy: 0.7215909090909091

Combined accuracy: 0.7329545454545454 ( Breed * Situation = 0.7092910640495868 )

(Situation | European Shorthair) accuracy: 0.7425742574257426

(Situation | Maine Coon) accuracy: 0.7763157894736842

Breed confusion matrix:
[[99  0]
 [ 3 74]]

Situation confusion matrix:
[[35 10  9]
 [ 6 23  4]
 [10 10 69]]

Combined confusion matrix:
[[19  0  5  0  4  0]
 [ 0 18  0  4  0  4]
 [ 3  1 15  1  3  0]
 [ 2  0  0  7  0  1]
 [ 7  0  2  0 39  0]
 [ 0  3  1  6  0 31]]

(Situation | European Shorthair) confusion matrix:
[[19  4 10]
 [ 3 10  2]
 [ 3  4 46]]

(Situation | Maine Coon) confusion matrix:
[[12  4  4]
 [ 3 12  3]
 [ 2  1 35]]