# K Nearest Neighbors Model

A simple classification model. This notebook can be used as a template for others.

# Imports

In [9]:
## Standard imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style

## Use seaborn's "whitegrid" theme for any matplotlib figures drawn in this notebook
set_style("whitegrid")

In [10]:
## More specific imports for this notebook

import joblib ## For saving trained models
from sklearn.neighbors import KNeighborsClassifier ## Import the model here
from sklearn.model_selection import train_test_split ## Import train_test_split
from sklearn.metrics import confusion_matrix ## Import confusion_matrix

# Initial Settings

In [11]:
data_fp = '../../data/processed_data/specgram_db.npy' ## Path to the processed spectrogram array (judging by the path, not raw wave data -- confirm)

# Load Data

In [17]:
# Load data (currently without classifications attached)
x_raw = np.load(data_fp)

## Each sample in x_raw is a multi-dimensional array, but KNeighborsClassifier
## needs one flat feature vector per sample. Collapse every axis after the
## first in a single reshape: the result is a 2-D (n_samples, n_features)
## ndarray whose rows equal `sample.flatten()` for each sample.
x_data = x_raw.reshape(len(x_raw), -1)
print(x_data[0])

[30.929146 32.63786  24.906853 ... -1.       -1.       -1.      ]


In [82]:
# Class labels, one per sample in x_data
## Placeholder for now: the real labels are not attached yet, so draw
## reproducible random integers from {0, 1, 2} using a fixed seed
rng = np.random.default_rng(seed=12987)
y_data = rng.integers(low=0, high=3, size=len(x_data))

# Train-Test Split

In [90]:
## Set up the train test split

# These variables both drive the split below and are reused later to build
# the filenames of saved runs, so change the split ONLY through them
test_size = 1/5
random_state = 440

# Pass the variables (never literals) so the split that is actually performed
# always matches the settings recorded in the saved filenames.
# train_test_split returns new containers, so no defensive copy of x_data is needed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data,
    shuffle=True,
    random_state=random_state,
    test_size=test_size,
)

# Fit Model

In [116]:
## Number of neighbours to vote with; kept in a variable so that saved
## runs can be named after it
k = 5

## Create the classifier
knn = KNeighborsClassifier(n_neighbors=k)

## Fit it on the training split
knn.fit(x_train, y_train)

# Assess Model Performance
Evaluate the fitted model on the held-out test set: confusion matrix, overall accuracy, and per-class precision and recall.

In [117]:
## Predict on the held-out test set
y_test_pred = knn.predict(x_test)

## Compute confusion matrix for model
## scikit-learn convention: conf_mat[i, j] counts samples whose TRUE class is i
## and whose PREDICTED class is j (rows = truth, columns = prediction)
conf_mat = confusion_matrix(y_test, y_test_pred)

## Compute accuracy for the model: correct predictions (the diagonal)
## divided by the total number of test samples
acc = np.trace(conf_mat) / conf_mat.sum()

## Compute precisions and recalls for model, one entry per class
true_pos = np.diag(conf_mat)
predicted_per_class = conf_mat.sum(axis=0)  ## column sums: samples predicted as each class
actual_per_class = conf_mat.sum(axis=1)     ## row sums: samples truly in each class

## precision = TP / (all predicted as the class); recall = TP / (all truly in the class).
## A class with a zero denominator has an undefined ratio: leave it as NaN
## and silence the 0/0 warning instead of letting it clutter the output.
with np.errstate(divide="ignore", invalid="ignore"):
    prec_vec = list(true_pos / predicted_per_class)
    recall_vec = list(true_pos / actual_per_class)

# Save Trained Model

In [118]:
# Assemble model_filename from the settings that characterise this run

model_filename = ''.join([
    '../../data/trained_models/',  ## Save location destination
    'k', str(k),                   ## k-value used for the model
    's', str(test_size),           ## test_size used for the train test split
    'r', str(random_state),        ## random_state used for the train test split
    '.pkl',
])
print(model_filename)

# Write the fitted model to disk
joblib.dump(knn, model_filename)

../../data/trained_models/k5s0.2r440.pkl


['../../data/trained_models/k5s0.2r440.pkl']

# Save Diagnostic Analysis

In [111]:
## Prepare to save the diagnostic data computed above for this model

## Build a filename stem that mirrors the saved model's name (no extension yet)
data_filename = ''.join([
    '../../data/trained_models/',
    'k', str(k),                   ## k-value used for the model
    's', str(test_size),           ## test_size used for the train test split
    'r', str(random_state),        ## random_state used for the train test split
])

## TODO: save the confusion matrix and all other desired data plots
## (nothing is written to disk by this cell yet)

## NOTE FOR LATER: Using k as a free parameter, create diagnostic plots