# KNN

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
%matplotlib inline
import scipy as sp
# Use a larger default figure size for any plots drawn after this point
plt.rcParams['figure.figsize'] = (15, 10)

## KNN Grid Search Results

First, we'll run `scale_run.py` and `grid_search.py` from the command line. Then we'll load the raw results of the grid search to identify the relevant columns.

In [2]:
# Read the raw step-0 grid search results and discard the 'Unnamed: 0' column
# (presumably the row index that was saved with the CSV -- confirm against
# grid_search.py).
tree_results = (
    pd.read_csv('../tmp/grid_search_knn0.csv')
    .drop(labels=['Unnamed: 0'], axis='columns')
)
# Preview the first ten rows of the raw results.
tree_results.head(n=10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__algorithm,param_classifier__leaf_size,param_classifier__metric,param_classifier__n_neighbors,param_classifier__p,param_classifier__weights,...,split2_train_accuracy,split3_train_accuracy,split4_train_accuracy,split5_train_accuracy,split6_train_accuracy,split7_train_accuracy,split8_train_accuracy,split9_train_accuracy,mean_train_accuracy,std_train_accuracy
0,0.020488,0.007085,0.023287,0.00729,auto,15,minkowski,3,1,uniform,...,0.684644,0.701873,0.702096,0.700599,0.696856,0.693862,0.696856,0.686377,0.695792,0.006439
1,0.016989,0.003345,0.023487,0.010155,auto,15,minkowski,3,1,distance,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.01509,0.002299,0.013893,0.0007,auto,15,minkowski,3,2,uniform,...,0.686142,0.700375,0.693114,0.697605,0.693114,0.695359,0.700599,0.687874,0.69377,0.005771
3,0.014791,0.001989,0.014347,0.001882,auto,15,minkowski,3,2,distance,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.017389,0.005621,0.01919,0.005397,auto,15,minkowski,5,1,uniform,...,0.606742,0.621723,0.624251,0.622006,0.616018,0.611527,0.610778,0.615269,0.615528,0.007303
5,0.01649,0.0028,0.01869,0.001845,auto,15,minkowski,5,1,distance,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.017088,0.001969,0.021293,0.007491,auto,15,minkowski,5,2,uniform,...,0.605993,0.628464,0.622006,0.616018,0.610778,0.601796,0.608533,0.620509,0.612758,0.008387
7,0.016789,0.00256,0.021788,0.005773,auto,15,minkowski,5,2,distance,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.01709,0.003046,0.01929,0.002721,auto,15,minkowski,7,1,uniform,...,0.569288,0.579775,0.582335,0.57485,0.55988,0.56512,0.573353,0.591317,0.57315,0.009415
9,0.019588,0.00777,0.02019,0.002924,auto,15,minkowski,7,1,distance,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## KNN Step 0 Scores

Now we will narrow down the relevant columns and sort by accuracy score.

In [3]:
# Columns to show from the grid search results: the rank first, then the
# mean/std accuracy on the test and train splits, then the KNN hyperparameters.
columns = [
    'rank_test_accuracy',
    'mean_test_accuracy',
    'mean_train_accuracy',
    'std_test_accuracy',
    'std_train_accuracy',
    'param_classifier__n_neighbors',
    'param_classifier__weights',
    'param_classifier__algorithm',
    'param_classifier__leaf_size',
    'param_classifier__p',
    'param_classifier__metric',
]
# Best-ranked parameter combinations first; show the top 30 rows.
tree_results.sort_values(by='rank_test_accuracy').loc[:, columns].head(n=30)

Unnamed: 0,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy,std_test_accuracy,std_train_accuracy,param_classifier__n_neighbors,param_classifier__weights,param_classifier__algorithm,param_classifier__leaf_size,param_classifier__p,param_classifier__metric
546,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,30,2,minkowski
578,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,40,2,minkowski
514,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,20,2,minkowski
482,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,15,2,minkowski
610,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,45,2,minkowski
483,6,0.473046,1.0,0.043794,0.0,3,distance,brute,15,2,minkowski
547,6,0.473046,1.0,0.043794,0.0,3,distance,brute,30,2,minkowski
515,6,0.473046,1.0,0.043794,0.0,3,distance,brute,20,2,minkowski
611,6,0.473046,1.0,0.043794,0.0,3,distance,brute,45,2,minkowski
579,6,0.473046,1.0,0.043794,0.0,3,distance,brute,40,2,minkowski


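The leaf size has no effect on the score: in the rows above, every `leaf_size` value ties at exactly the same rank. This is expected, since `leaf_size` only tunes how the BallTree/KDTree structures are built and queried and is not used by the `brute` algorithm, so we will leave it at the default setting of 30.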
The results suggest that 3 is the ideal value for the n_neighbors parameter and 2 is the ideal value for the p parameter.

The results for the `weights` and `algorithm` parameters are still inconclusive, so we will fix the other parameters at the values above and rerun the grid search over just those two.

## KNN Step 1 Scores

In [4]:
# Load the step-1 grid search results (the follow-up run that varies only
# `weights` and `algorithm`). As with the step-0 load, discard the stray
# 'Unnamed: 0' column so both result frames have the same layout;
# errors='ignore' keeps the load working if this file was written without it.
# NOTE: the name `tree_results` is kept for compatibility with the rest of the
# notebook even though these are KNN results.
tree_results = (
    pd.read_csv('../tmp/grid_search_knn1.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
# `columns` is the summary column list defined in the Step 0 cell; it is reused
# here instead of being re-declared so the two tables cannot drift apart.
tree_results.sort_values('rank_test_accuracy')[columns].head(10)

Unnamed: 0,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy,std_test_accuracy,std_train_accuracy,param_classifier__n_neighbors,param_classifier__weights,param_classifier__algorithm,param_classifier__leaf_size,param_classifier__p,param_classifier__metric
6,1,0.474394,0.695118,0.043879,0.005973,3,uniform,brute,30,2,minkowski
7,2,0.473046,1.0,0.043794,0.0,3,distance,brute,30,2,minkowski
0,3,0.471024,0.693845,0.045648,0.005676,3,uniform,auto,30,2,minkowski
1,3,0.471024,1.0,0.044738,0.0,3,distance,auto,30,2,minkowski
2,3,0.471024,0.693995,0.045648,0.006001,3,uniform,ball_tree,30,2,minkowski
3,3,0.471024,1.0,0.044738,0.0,3,distance,ball_tree,30,2,minkowski
4,3,0.471024,0.693845,0.045648,0.005676,3,uniform,kd_tree,30,2,minkowski
5,3,0.471024,1.0,0.044738,0.0,3,distance,kd_tree,30,2,minkowski


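It looks like 'brute' is the best choice for the algorithm and 'uniform' the best choice for weights at this point, although the margin is small: the mean test accuracies in the table differ by less than 0.004, well within the fold-to-fold standard deviation of about 0.044.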

So the final parameters that we will be using for KNN are:  
n_neighbors = 3  
weights = 'uniform'  
algorithm = 'brute'  
leaf_size = 30  
p = 2  
metric = 'minkowski'  

This combination yielded the following scores:  
mean_train_accuracy = 0.695118  
mean_test_accuracy = 0.474394  
std_test_accuracy = 0.043879  
std_train_accuracy = 0.005973  

The low standard deviations of the test and train scores (about 0.044 and 0.006 across the 10 cross-validation splits) indicate that the scoring is fairly consistent from split to split. The gap between the mean scores is concerning, though. The mean accuracy on the training data is about 70%, while the mean accuracy on the test data is only about 47%. This means that the model isn't generalizing well and is overfitting to a certain degree.