# Nearest Neighbors Exercises

In [None]:
import numpy as np
import pandas as po
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Problem 1

Consider the following simple data-set:

<img src="https://github.com/BeaverWorksMedlytics2020/Data_Public/raw/master/Images/Week1/knn_notebook_example_table.png" alt="Example Table" width="600">

Now consider the Sample:
    $$X= 4, Y = 4, Z = 2$$

Using kNN, what is the class for this sample for $k = 1$ and $k = 3$? Use the Euclidean metric.

Using kNN, we found that when $k = 1$ the class is 1, and when $k = 3$ the class is 2.

## Problem 2
Earlier in the tutorial we were told that kNN depends on several factors, one of them being $k$. For each of the datasets below, find the value of $k$ that gives the highest accuracy. Visualize your data! Can you come up with a rule of thumb for choosing a good $k$?

HINT: look for a pattern/bound! Answer should be in terms of the size of the dataset $n$. 

In [None]:
# Solve this problem for each of these datasets
from sklearn.datasets import load_iris 
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine 

# Load those datasets into some easily accessible variables
# NOTE: the iris values printed further down are raw measurements, not
# standardized; presumably the same holds for the other two datasets (verify).
# KNN is distance-based, so consider scaling the features before classifying.
iris = load_iris()                    #iris dataset: size = 150
breast_cancer = load_breast_cancer()  #breast cancer dataset: size = 569
wine = load_wine()                    #wine dataset: size 178

# Fits a KNN classifier for a specified k and reports its hold-out accuracy
def split_train_test_dataset(dataset, k, test_size=0.2):
    """Split `dataset`, fit a k-nearest-neighbors classifier, and score it.

    Args:
        dataset: object exposing `.data` (features) and `.target` (labels).
        k: number of neighbors passed to `KNeighborsClassifier`.
        test_size: value forwarded to `train_test_split` as `test_size`.

    Returns:
        Accuracy on the held-out split, expressed as a percentage (0-100).
    """
    # random_state is fixed so every k is evaluated on the same split
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=0)

    classifier = KNeighborsClassifier(n_neighbors=k).fit(features_fit, labels_fit)
    holdout_predictions = classifier.predict(features_holdout)
    return accuracy_score(labels_holdout, holdout_predictions) * 100


In [None]:
split_train_test_dataset(iris, 7) #6-25 : 100% iris

accuracy = split_train_test_dataset(iris, 7)
accuracy


def _max_valid_k(dataset, test_size=0.2):
    """Return the largest k KNN accepts: the number of TRAINING samples.

    Mirrors how train_test_split sizes the split when only `test_size` is
    given: the test set gets ceil(test_size * n) rows and training gets the
    rest. Asking for more neighbors than training rows raises a ValueError.
    """
    n_samples = len(dataset.target)
    return n_samples - int(np.ceil(test_size * n_samples))


# Sweep k over every valid value for each dataset; keep the k values next to
# the accuracies so the plot's x and y always have matching lengths.
k_values_iris = range(1, _max_valid_k(iris) + 1)
acc_values_iris = [split_train_test_dataset(iris, k) for k in k_values_iris]

k_values_breast_cancer = range(1, _max_valid_k(breast_cancer) + 1)
acc_values_breast_cancer = [
    split_train_test_dataset(breast_cancer, k) for k in k_values_breast_cancer]

k_values_wine = range(1, _max_valid_k(wine) + 1)
acc_values_wine = [split_train_test_dataset(wine, k) for k in k_values_wine]

plt.plot(k_values_iris, acc_values_iris, label='iris')
plt.plot(k_values_breast_cancer, acc_values_breast_cancer, label='breast cancer')
plt.plot(k_values_wine, acc_values_wine, label='wine')

plt.xlabel('k (number of neighbors)')
plt.ylabel('Validation accuracy (%)')
plt.legend()  # the labels above are only shown once a legend is drawn
plt.show()

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 120, n_neighbors = 121

In [None]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

Write a single mathematical expression describing the relationship you found between $n$ (the size of the dataset) and $k$ (the number of datapoints used to classify each validation datum).

Mathematical answer: $1 \le k \le \sqrt{n}$ (a good $k$ lies between 1 and roughly $\sqrt{n}$).

## Problem 3
Now, we will **be writing our own kNN algorithm**. Recall that kNN consists of making a prediction for each individual sample and using those predictions to classify the data. Here we will try to mimic sklearn's kNN methods. We will be using the Pima diabetes dataset.

### Loading and splitting data

In [None]:
url = "https://github.com/BeaverWorksMedlytics2020/Data_Public/raw/master/NotebookExampleData/Week1/diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = po.read_csv(url, names=names)

# Zeros in these columns are treated as missing values: turn them into NaN
# so the affected rows can be dropped below.
invalid = ['plas', 'pres', 'skin', 'test', 'mass']

# Assign the result back explicitly. Calling .replace(..., inplace=True) on
# data[col] mutates a temporary selection, which does not reliably update
# `data` (and never does under pandas Copy-on-Write).
data[invalid] = data[invalid].replace(0, np.nan)

# Drop every row that has at least one missing value and renumber the rows
data = data.dropna(axis=0).reset_index(drop=True)

Now, let's clearly define which columns will act as explanatory variables, and which column will be the target value, and split the dataset between your training data and testing data. Let's try an 80-20 split and use sklearn's [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) method (set random_state = 0 so we get the same output each time).

In [None]:
# Feature columns the predictions are made from (feel free to play around with these)
X_cols = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']

# Label column that we want to predict
y_col = 'class'

# Fraction held out at each split (80-20 split of the dataset)
test_size = 0.2

# First split: separate the test set from the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    data[X_cols],
    data[y_col],
    test_size=test_size,
    random_state=0,
)
# Second split: carve a validation set out of the remaining training data
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=test_size,
    random_state=0,
)

print(X_train, X_test)

# Report (rows, features, labels) for each of the three splits
print('There are {} training samples with {} features and {} associated classification labels'.format(
    X_train.shape[0], X_train.shape[1], y_train.shape[0]))
print('There are {} validation samples with {} features and {} associated classification labels'.format(
    X_val.shape[0], X_val.shape[1], y_val.shape[0]))
print('There are {} test samples with {} features and {} associated classification labels'.format(
    X_test.shape[0], X_test.shape[1], y_test.shape[0]))

     preg   plas  pres  skin   test  mass   pedi  age
173     1  139.0  46.0  19.0   83.0  28.7  0.654   22
289     1   87.0  68.0  34.0   77.0  37.6  0.401   24
115     7  181.0  84.0  21.0  192.0  35.9  0.586   51
335     5  123.0  74.0  40.0   77.0  34.1  0.269   28
181     5   99.0  54.0  28.0   83.0  34.0  0.499   30
..    ...    ...   ...   ...    ...   ...    ...  ...
257     7   94.0  64.0  25.0   79.0  33.3  0.738   41
11     10  125.0  70.0  26.0  115.0  31.1  0.205   41
249     1  139.0  62.0  41.0  480.0  40.7  0.536   21
101     0  140.0  65.0  26.0  130.0  42.6  0.431   24
248     0  135.0  68.0  42.0  250.0  42.3  0.365   24

[250 rows x 8 columns]      preg   plas  pres  skin   test  mass   pedi  age
144     2  146.0  70.0  38.0  360.0  28.0  0.337   29
280     8  186.0  90.0  35.0  225.0  34.5  0.423   37
68      2  108.0  52.0  26.0   63.0  32.5  0.318   22
372     5  117.0  86.0  30.0  105.0  39.1  0.251   42
328     3  103.0  72.0  30.0  152.0  27.6  0.730   27
..  

### Normalizing Data

Let's not forget to normalize the data! We'll use sklearn's StandardScaler normalization like we did before to normalize the training **and** validation/test data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the TRAINING data
# only. Re-fitting on the validation/test sets would scale every split
# differently and leak information from the held-out data.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same training-set statistics to all three splits. The results are
# wrapped back into DataFrames with the original index and columns because the
# kNN code below looks rows up through .index and .loc.
X_train = po.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val = po.DataFrame(
    scaler.transform(X_val), index=X_val.index, columns=X_val.columns)
X_test = po.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

### Writing your kNN

Now for the fun part! Fill in the 3 following methods, euclidean_dist(), predict(), and knn().

The predict method that we'll make below needs to: 
1. Compute the euclidean distance between the “new” observation and all the data points in the training set. 
2. Pair each distance with the class label of the corresponding training point
3. Select the k nearest ones and perform a "majority vote"

In [None]:
# Euclidean distance function from tutorial
def euclidean_dist(datum1, datum2):
    """Return the Euclidean distance between two equally-shaped data points.

    Both inputs are converted to float numpy arrays first, so lists, numpy
    arrays and pandas Series are all compared strictly by position (a Series'
    index labels are ignored).

    Args:
        datum1: first point (array-like of numbers).
        datum2: second point (array-like of numbers), same shape as `datum1`.

    Returns:
        The distance as a numpy float for 1-D inputs. For higher-dimensional
        inputs the squared differences are summed over the first axis only.

    Raises:
        ValueError: if the two points do not have the same shape.
    """
    point_a = np.asarray(datum1, dtype=float)
    point_b = np.asarray(datum2, dtype=float)

    if point_a.shape != point_b.shape:
        raise ValueError(
            "Points must have the same shape, got {} and {}".format(
                point_a.shape, point_b.shape))

    # Vectorized sum of squared coordinate differences
    inner_val = np.sum((point_a - point_b) ** 2, axis=0)
    return np.sqrt(inner_val)

In [None]:
from collections import Counter

def predict(x_training, y_training, x_test_sample, k):
    """Classify one sample by majority vote among its k nearest neighbors.

    Args:
        x_training: DataFrame of training features, one row per sample.
        y_training: Series of training labels, looked up by the index labels
            of `x_training`.
        x_test_sample: feature values of the sample to classify, in the same
            column order as `x_training`.
        k: number of nearest neighbors that take part in the vote.

    Returns:
        The most common label among the k nearest training samples. When
        several labels share the top count, the one met first while walking
        the neighbors from nearest to farthest wins.

    Raises:
        ValueError: if `k` is smaller than 1 or larger than the number of
            training samples.
    """
    n_train = len(x_training)
    if not 1 <= k <= n_train:
        raise ValueError(
            "Expected 1 <= k <= n_samples, but n_samples = {}, k = {}".format(
                n_train, k))

    # Work on plain numpy rows: much faster than one .loc lookup per row
    sample = np.asarray(x_test_sample, dtype=float)
    training_rows = x_training.to_numpy(dtype=float)

    # (distance, index label) pairs, nearest first; equal distances are
    # ordered by index label
    nearest = sorted(
        (euclidean_dist(sample, row), label)
        for row, label in zip(training_rows, x_training.index)
    )[:k]

    votes = Counter(y_training.loc[label] for _, label in nearest)
    return votes.most_common(1)[0][0]

In [None]:
def knn(x_training, y_training, x_testing, k):
    """Predict a label for every row of `x_testing`.

    Each row is fetched by its index label and handed to `predict` together
    with the training data and `k`.

    Returns:
        A list of predicted labels, in the row order of `x_testing`.
    """
    return [
        predict(x_training, y_training, x_testing.loc[row_label], k)
        for row_label in x_testing.index
    ]

When done, test your code by running the methods here!

In [None]:
from sklearn.metrics import accuracy_score
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start = time.perf_counter()
predictions_slow = knn(X_train, y_train, X_val, k=5)
elapsed = time.perf_counter() - start

print('Took {} seconds'.format(elapsed))
print("Validation Accuracy is ", accuracy_score(y_val,predictions_slow)*100)

Took 4.358816862106323 seconds
Validation Accuracy is  80.95238095238095


Check sklearn's predictions on validation data from the tutorial notebook and make sure they match yours. Sklearn is faster, but you should get the same answers.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0e58ff94-2716-4c23-bc14-d1bdf69b13de' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>