### k-nearest neighbors example
Chronic kidney disease data

In [None]:
import numpy as np
from datascience import *
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', UserWarning)
from IPython.display import Image
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

In [None]:
# Download the chronic kidney disease CSV and keep only the rows that have
# no missing values in any column.
url = "https://raw.githubusercontent.com/vamshikrishnajr/chronic-kidney-disease-diagnosis/master/kidney_disease.csv"
ckd_db = pd.read_csv(url).dropna(axis=0)
ckd_db

In [None]:
# Encode the label numerically: 1 where 'classification' is exactly "ckd",
# 0 for every other value.
is_ckd = ckd_db['classification'] == "ckd"
ckd_db['Class'] = np.where(is_ckd, 1, 0)

# Convert the pandas DataFrame into a datascience Table for the steps below.
ckd = Table.from_df(ckd_db)
ckd

In [None]:
def standard_units(xyz):
    """Convert any array of numbers to standard units.

    Each value has the mean of ``xyz`` subtracted from it and is then
    divided by the standard deviation of ``xyz`` (as computed by ``np.std``).
    """
    center = np.mean(xyz)
    spread = np.std(xyz)
    return (xyz - center) / spread

In [None]:
# Assemble the working table: the three predictor columns are converted to
# standard units, while the 0/1 'Class' label is copied over unchanged.
ckd_s = (
    Table()
    .with_column('Hemoglobin', standard_units(ckd.column('hemo')))
    .with_column('Glucose', standard_units(ckd.column('bgr')))
    .with_column('White Blood Cell Count', standard_units(ckd.column('wc')))
    .with_column('Class', ckd.column('Class'))
)

In [None]:
# Summary statistics for every column of ckd_s, as a quick check on the
# standardized values.
ckd_s.stats()

In [None]:
# Lookup table that assigns a plotting color to each class label
# (1 -> 'darkblue', 0 -> 'gold').
color_table = (
    Table()
    .with_column('Class', make_array(1, 0))
    .with_column('Color', make_array('darkblue', 'gold'))
)

# Attach the matching color to every row by joining on the 'Class' column.
ckd_s = ckd_s.join('Class', color_table)

In [None]:
# Display ckd_s to inspect the table after the join with color_table.
ckd_s

In [None]:
# Scatter plot of the 'Hemoglobin' and 'Glucose' columns, with points grouped
# by the 'Color' column so the two classes can be told apart.
ckd_s.scatter('Hemoglobin', 'Glucose', group='Color')

In [None]:
# Scatter plot of the 'White Blood Cell Count' and 'Glucose' columns, with
# points grouped by the 'Color' column so the two classes can be told apart.
ckd_s.scatter('White Blood Cell Count', 'Glucose', group='Color')

In [None]:
# Use 80% of the rows (rounded down) for training and the rest for testing.
# NOTE(review): Table.split presumably samples rows at random, so the exact
# rows (and every result below) can differ between runs -- confirm.
n_train = int(ckd_s.num_rows*0.8)
train, test = ckd_s.split(n_train)
print(train.num_rows, 'training and', test.num_rows, 'test instances.')

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between point1 and point2.

    Both arguments are arrays holding the coordinates of a point; the result
    is the square root of the sum of the squared coordinate differences.
    """
    offsets = point1 - point2
    squared_total = np.sum(offsets**2)
    return np.sqrt(squared_total)

In [None]:
def row_distance(row1, row2):
    """Return the distance between two table rows.

    The rows are first converted to NumPy arrays, because `distance`
    relies on element-wise array arithmetic.
    """
    first = np.array(row1)
    second = np.array(row2)
    return distance(first, second)

In [None]:
# Pick one example row from each table, keeping only the attribute columns
# (the 'Class' label and the plotting 'Color' are dropped first).
test_point_row = (
    test
    .drop("Class","Color")
    .row(9)
)
train_point_row = (
    train
    .drop("Class","Color")
    .row(8)
)

In [None]:
# View the selected training row's attribute values as a NumPy array.
np.array(train_point_row)

In [None]:
# Distance between the selected training row and test row, computed on the
# attribute columns only (both rows had 'Class' and 'Color' dropped).
row_distance(train_point_row,test_point_row)

In [None]:
# Display the leading rows of the test table.
# NOTE(review): show(9) presumably prints 9 rows (indices 0-8), so the test
# row selected earlier with .row(9) would not be among them -- confirm.
test.show(9)

In [None]:
def distances(training, example, output):
    """Return `training` with an added 'Distance' column.

    The `output` column is dropped before measuring, so only the remaining
    columns contribute to the distance between each training row and
    `example`. The returned table still contains the `output` column.
    """
    feature_rows = training.drop(output).rows
    gaps = [row_distance(candidate, example) for candidate in feature_rows]
    return training.with_column('Distance', gaps)

def closest(training, example, k, output):
    """Return a table holding the k training rows nearest to `example`.

    Rows are sorted by their 'Distance' to `example` and the first k rows of
    that sorted table are returned.
    """
    with_gaps = distances(training, example, output)
    ranked = with_gaps.sort('Distance')
    first_k = np.arange(k)
    return ranked.take(first_k)

In [None]:
# Inspect the test row at index 31, including its 'Class' and 'Color' values.
test.row(31)

In [None]:
# Find the 8 training rows nearest to test row 31. 'Color' is removed from
# the training table and both 'Class' and 'Color' from the test row, so the
# example holds attribute values only.
cl_knn = closest(
    train.drop("Color"),
    test.drop("Class","Color").row(31),
    8,
    "Class",
)
cl_knn

### We can use the most frequent class among the nearest neighbors as the prediction (a majority vote)

In [None]:
# Majority vote: count how often each class label occurs among the
# neighbors and report the label with the highest count.
print("Most frequent value in the above array:")
votes = np.bincount(cl_knn.column("Class"))
print(votes.argmax())

### Prediction
Now let's predict the class of every test row and compute the percentage of correct predictions (accuracy).

In [None]:
# Classify every test row with a k-nearest-neighbors majority vote and report
# the percentage of predictions that match the actual class.
correct = 0
k = 20  # with an even k a tied vote goes to class 0 (argmax returns the first maximum)

# These tables do not change inside the loop, so build them once instead of
# re-dropping columns on every iteration.
train_features = train.drop("Color")          # attributes plus the 'Class' label
test_features = test.drop("Class","Color")    # attributes only
actual_classes = test.column("Class")

for i, row in enumerate(test_features.rows):
    cl_knn = closest(train_features, row, k, "Class")
    # The predicted class is the most frequent label among the k neighbors.
    prediction = np.bincount(cl_knn.column("Class")).argmax()
    actual = actual_classes.take(i)
    print(i," Prediction: ",prediction," Actual: ",actual)
    correct += (prediction == actual)

# Divide by the number of test rows rather than the leaked loop index, which
# is undefined when the test table is empty.
if test.num_rows > 0:
    print("% Correct: ",correct/test.num_rows*100)
else:
    print("% Correct: ","n/a (no test rows)")