## K Nearest Neighbors

In [94]:
import numpy as np

In [95]:
def classify_response(val):
    """Encode the response column read from the csv as integers.

    Entries equal to the string 'Abnormal' map to 1; every other entry
    maps to 0.
    """
    is_abnormal = val == 'Abnormal'
    return np.where(is_abnormal, 1, 0)

In [96]:
def scale_mean(df, fitted_stats=None):
    """Standardize each feature column: subtract its mean and divide by its
    standard deviation.

    Parameters
    ----------
    df : array-like
        Features to scale, one column per feature.
    fitted_stats : dict, optional
        Dict with keys 'means' and 'stds', as returned by an earlier call.
        Pass the statistics fitted on the training data when scaling new
        data for prediction. If None, the statistics are computed from `df`.

    Returns
    -------
    tuple
        (scaled features, fitted_stats), where fitted_stats is the dict of
        per-column 'means' and 'stds' that was used.
    """
    if fitted_stats is not None:
        means, stds = fitted_stats['means'], fitted_stats['stds']
    else:
        means = np.mean(df, axis=0)
        stds = np.std(df, axis=0)
        fitted_stats = dict(means=means, stds=stds)
    # A constant column has a standard deviation of 0. Divide by 1 there so
    # the column scales to 0 instead of NaN/inf. The stored statistics are
    # left untouched.
    stds_arr = np.asarray(stds)
    safe_stds = np.where(stds_arr == 0, 1, stds_arr)
    df = (df - means) / safe_stds
    return df, fitted_stats

In [97]:
def scale_minmax(df, fitted_stats=None):
    """Rescale each feature column with its minimum and maximum so that the
    fitted data lies between 0 and 1.

    Parameters
    ----------
    df : array-like
        Features to scale, one column per feature.
    fitted_stats : dict, optional
        Dict with keys 'mins' and 'maxes', as returned by an earlier call.
        Pass the statistics fitted on the training data when scaling new
        data for prediction (such data may then fall outside [0, 1]). If
        None, the statistics are computed from `df`.

    Returns
    -------
    tuple
        (scaled features, fitted_stats), where fitted_stats is the dict of
        per-column 'mins' and 'maxes' that was used.
    """
    if fitted_stats is not None:
        mins, maxes = fitted_stats['mins'], fitted_stats['maxes']
    else:
        mins = np.min(df, axis=0)
        maxes = np.max(df, axis=0)
        fitted_stats = dict(mins=mins, maxes=maxes)
    # A constant column has max == min. Divide by 1 there so the column
    # scales to 0 instead of NaN/inf. The stored statistics are left
    # untouched.
    ranges = np.asarray(maxes) - np.asarray(mins)
    safe_ranges = np.where(ranges == 0, 1, ranges)
    df = (df - mins) / safe_ranges
    return df, fitted_stats

In [98]:
def row_euclidean(v1, v2):
    """Pairwise euclidean distances between the rows of two rank 2 arrays.

    The result has one row per row of v1 and one column per row of v2;
    entry (i, j) is the euclidean distance from v1[i] to v2[j].

    In KNN, v1 is typically the new data to predict on and v2 holds every
    candidate neighbor.
    """
    n_queries, n_candidates = v1.shape[0], v2.shape[0]
    distances = np.zeros((n_queries, n_candidates))
    for idx, query in enumerate(v1):
        # Broadcasting subtracts the query row from every row of v2.
        squared_deltas = (v2 - query)**2
        distances[idx] = np.sqrt(np.sum(squared_deltas, axis=1))
    return distances

In [99]:
def row_cosine_distance(v1, v2):
    """Pairwise cosine distance (1 - cosine similarity) between the rows of
    two rank 2 arrays.

    The result has one row per row of v1 and one column per row of v2;
    entry (i, j) is the cosine distance from v1[i] to v2[j].
    """
    # The euclidean distance from a row to the origin is that row's norm.
    # Each result is a column with one norm per input row.
    origin = np.zeros((1, v1.shape[1]))
    norms_1 = row_euclidean(v1, origin)
    norms_2 = row_euclidean(v2, origin)

    distances = np.zeros((v1.shape[0], v2.shape[0]))
    for idx, query in enumerate(v1):
        dot_products = np.sum(query * v2, axis=1)
        norm_products = (norms_1[idx] * norms_2).flatten()
        distances[idx] = 1 - dot_products / norm_products
    return distances

In [100]:
def mode(classes):
    """Return the class that occurs most often in a list or array of
    numbers.
    """
    labels = set(classes)
    members = list(classes)
    tallies = [(label, members.count(label)) for label in labels]
    # Stable sort: among equally frequent classes, the one that comes first
    # when iterating the set wins.
    tallies.sort(key=lambda pair: pair[1], reverse=True)
    return tallies[0][0]

In [108]:
class KNN_classifier():
    """Creates a K Nearest Neighbors classifier.

    Parameters:
    -----------
    X_train : ndarray
        Array of features (stored as `X`).
    y_train : ndarray
        Array of labels (stored as `y`).
    k : int
        # of neighbors to return.
    scale_method : str
        One of ('mean', 'minmax'). 'Mean' standardizes the features to have
        mean 0 and standard deviation 1. 'Minmax' normalizes features to lie
        between 0 and 1. If None, no scaling will be done (not recommended).
    distance : str
        One of ('euclidean', 'cosine'). 'Euclidean' computes the straight-line
        distance between points. 'Cosine' distance is 1 - cosine similarity
        and may be more appropriate in high dimensional spaces.

    Attributes:
    -----------
    X, y, k, scale_method, distance
        The constructor arguments, stored as given.
    X_scaled
        Training features after scaling (`X` itself when scaling is off).
    stats : dict or None
        Scaling statistics fitted on the training features, reused to scale
        new data in `predict`. None when scaling is off.
    distances, nearest_neighbors, neighbor_distances, neighbor_classes,
    predicted_class, class_certainty
        Results of the most recent `predict` call.

    Raises:
    -------
    ValueError
        If `scale_method` or `distance` is not one of the supported values.
    """

    _SCALE_METHODS = ('mean', 'minmax')
    _DISTANCES = ('euclidean', 'cosine')

    def __init__(self, X_train, y_train, k=5, scale_method='mean',
                 distance='euclidean'):
        # Validate up front so a typo gives a clear error instead of a
        # failure somewhere inside scaling or prediction.
        if scale_method and scale_method not in self._SCALE_METHODS:
            raise ValueError(
                f'scale_method must be one of {self._SCALE_METHODS} or None,'
                f' got {scale_method!r}')
        if distance not in self._DISTANCES:
            raise ValueError(
                f'distance must be one of {self._DISTANCES},'
                f' got {distance!r}')
        self.X = X_train
        self.y = y_train
        self.k = k
        self.scale_method = scale_method
        self.distance = distance
        if self.scale_method:
            self.X_scaled, self.stats = self._scale_features(self.X)
        else:
            # No scaling requested: use the raw features as they are.
            self.X_scaled, self.stats = self.X, None

    def _scale_features(self, df, stats=None):
        """Scale features using specified method. Returns the tuple
        (scaled features, stats) produced by the scaling function.
        """
        if self.scale_method == 'mean':
            return scale_mean(df, fitted_stats=stats)
        if self.scale_method == 'minmax':
            return scale_minmax(df, fitted_stats=stats)
        raise ValueError(
            f'scale_method must be one of {self._SCALE_METHODS},'
            f' got {self.scale_method!r}')

    def _find_neighbors(self, k=None):
        """Finds indices, distances, and classes for the k nearest
        neighbors, reading `self.distances`. Uses `self.k` when k is None.
        """
        k = self.k if k is None else k
        order = np.argsort(self.distances, axis=1)[:, :k]
        self.nearest_neighbors = order
        # Read the distances through the same ordering so each index lines
        # up with its distance and the data is only sorted once.
        self.neighbor_distances = np.take_along_axis(self.distances, order,
                                                     axis=1)
        # Positional lookup of every neighbor's label in one step.
        self.neighbor_classes = np.asarray(self.y)[order]

    def _compute_certainty(self):
        """Using nearest neighbors, finds predicted class and class
        certainty (the share of neighbors that have the predicted class).
        """
        self.predicted_class = [mode(row) for row in self.neighbor_classes]
        self.class_certainty = [np.mean(classes == pred) for classes, pred in
                                zip(self.neighbor_classes, self.predicted_class)]

    def predict(self, X_new, k=None):
        """Pass in 2d array of x values and classify each row from its
        nearest neighbors in the train set.

        Parameters
        ----------
        X_new : ndarray
            2d array with one row per point to classify.
        k : int, optional
            # of neighbors to use for this call only. Defaults to the `k`
            the classifier was created with.

        Returns
        -------
        dict
            Keys 'predicted_class', 'class_certainty', 'nearest_neighbors'
            (indices into the train set), 'neighbor_classes' and
            'neighbor_distances'. The same values are also stored as
            attributes on the classifier.

        Raises
        ------
        ValueError
            If k is less than 1 or `distance` is not supported.
        """
        k = self.k if k is None else k
        if k < 1:
            raise ValueError(f'k must be at least 1, got {k!r}')
        if self.scale_method:
            X_new_scaled, _ = self._scale_features(X_new, stats=self.stats)
        else:
            X_new_scaled = X_new
        if self.distance == 'euclidean':
            self.distances = row_euclidean(X_new_scaled, self.X_scaled)
        elif self.distance == 'cosine':
            self.distances = row_cosine_distance(X_new_scaled, self.X_scaled)
        else:
            # Without this, stale distances from an earlier call would be
            # reused silently.
            raise ValueError(
                f'distance must be one of {self._DISTANCES},'
                f' got {self.distance!r}')
        self._find_neighbors(k)
        self._compute_certainty()

        return dict(predicted_class=self.predicted_class,
                    class_certainty=self.class_certainty,
                    nearest_neighbors=self.nearest_neighbors,
                    neighbor_classes=self.neighbor_classes,
                    neighbor_distances=self.neighbor_distances)

    def __str__(self):
        """Display object details when printed."""
        return (f'KNN_classifier(k={self.k}, distance={self.distance},'
                f' scale_method={self.scale_method})')

    def __repr__(self):
        """Display object details when called in interactive mode."""
        return self.__str__()

### Fit a model using mean standardization and Euclidean distance

In [109]:
# Features come from the first six columns of the csv; the header row is
# skipped.
X = np.loadtxt('bio.csv', skiprows=1, delimiter=',', usecols=range(6))
# The class label is the seventh column, read as text. The builtin `str` is
# used because the `np.str` alias is no longer available in current NumPy.
y = np.loadtxt('bio.csv', skiprows=1, delimiter=',', usecols=6, dtype=str)
# Turn the text labels into 1 ('Abnormal') / 0 (anything else).
y = classify_response(y)

In [116]:
# Classifier with the default k, standardized features and straight-line
# distance. The bare name on the last line displays its repr.
mod = KNN_classifier(
    X_train=X,
    y_train=y,
    scale_method='mean',
    distance='euclidean',
)
mod

KNN_classifier(k=5, distance=euclidean, scale_method=mean)

In [117]:
# Seed the generator so the synthetic query point is reproducible.
np.random.seed(1)
# One query row: the per-column means of X, all multiplied by the same
# random factor (a single draw from rand, times 3).
X_new = np.mean(X, axis=0) * np.random.rand(1, 1) * 3
pred1 = mod.predict(X_new)
# The bare name on the last line displays the prediction dict.
pred1

{'class_certainty': [0.6],
 'nearest_neighbors': array([[ 83, 226, 275, 200,  60]]),
 'neighbor_classes': array([[1, 0, 0, 1, 1]]),
 'neighbor_distances': array([[1.24555067, 1.2904598 , 1.49994867, 1.65254366, 1.66789103]]),
 'predicted_class': [1]}

In [118]:
# Re-seed so the batch of synthetic query points is reproducible.
np.random.seed(1)
# Five query rows: the per-column medians of X, each entry multiplied by its
# own random factor (a 5 x 6 draw from rand, times 3).
X_multi = np.median(X, axis=0) * np.random.rand(5, 6) * 3
pred_multi = mod.predict(X_multi)
# The bare name on the last line displays the prediction dict.
pred_multi

{'class_certainty': [1.0, 1.0, 0.6, 1.0, 1.0],
 'nearest_neighbors': array([[167,   0,  51, 209, 149],
        [226, 293, 237, 275, 221],
        [298, 118, 275, 277,  89],
        [195, 142, 171,  96, 193],
        [206,  51, 191, 122, 162]]),
 'neighbor_classes': array([[1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 0, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]]),
 'neighbor_distances': array([[4.19763127, 4.34629258, 4.5234966 , 4.54208665, 4.57139749],
        [2.48358947, 2.55509978, 2.63127072, 2.63977651, 2.86708796],
        [4.62450306, 4.80780351, 4.91718746, 4.98573317, 5.00267267],
        [5.38978908, 5.70179844, 5.83418211, 5.85219075, 5.91739819],
        [6.14890789, 6.32010408, 6.34158484, 6.54352765, 6.61925189]]),
 'predicted_class': [1, 0, 0, 1, 1]}

### Fit a model using min-max normalization and cosine distance

In [119]:
# Second classifier: 3 neighbors, min-max scaled features, cosine distance.
# The bare name on the last line displays its repr.
mod2 = KNN_classifier(
    X_train=X,
    y_train=y,
    k=3,
    scale_method='minmax',
    distance='cosine',
)
mod2

KNN_classifier(k=3, distance=cosine, scale_method=minmax)

In [120]:
# Classify the same single query row (X_new from the earlier cell) with the
# min-max / cosine model, then display the prediction dict.
pred2 = mod2.predict(X_new)
pred2

{'class_certainty': [0.6666666666666666],
 'nearest_neighbors': array([[ 83, 275, 237]]),
 'neighbor_classes': array([[1, 0, 0]]),
 'neighbor_distances': array([[0.00283673, 0.00315944, 0.00327326]]),
 'predicted_class': [0]}

In [121]:
# Classify the five-row batch (X_multi from the earlier cell) with the
# min-max / cosine model; the returned dict is displayed as the cell output.
mod2.predict(X_multi)

{'class_certainty': [1.0, 1.0, 1.0, 1.0, 1.0],
 'nearest_neighbors': array([[162, 206,  51],
        [272, 308, 295],
        [ 40,  49,   1],
        [112, 125,   9],
        [162, 206, 191]]),
 'neighbor_classes': array([[1, 1, 1],
        [0, 0, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]),
 'neighbor_distances': array([[0.16982651, 0.19673849, 0.20557966],
        [0.02752439, 0.03190742, 0.03272664],
        [0.06378024, 0.08717138, 0.0916856 ],
        [0.11870047, 0.11914467, 0.12030607],
        [0.17775531, 0.19182757, 0.22049776]]),
 'predicted_class': [1, 0, 1, 1, 1]}

## Scikit-Learn Implementation

In [122]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [123]:
# Reload the same csv with pandas. Note that X and y from the cells above
# are rebound here.
df = pd.read_csv('bio.csv')
# Features: every column except the last one.
X = df.iloc[:, :-1]
# Labels: multiplying the boolean comparison by 1 gives 1 where the class is
# 'Abnormal' and 0 elsewhere.
y = (df['class'] == 'Abnormal') * 1
# Display the first rows of the frame.
df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027818,22.552586,39.609117,40.475232,98.672917,-0.2544,Abnormal
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal


In [124]:
# scikit-learn objects for the comparison: a standardizing scaler and a
# 5-neighbor classifier.
scaler = StandardScaler()
sk_knn = KNeighborsClassifier(n_neighbors=5)

In [125]:
# Fit the scaler on the training features, then apply that same fitted
# scaling to the single query row X_new from the earlier cell.
X_scaled = scaler.fit_transform(X)
X_new_scaled = scaler.transform(X_new)
sk_knn.fit(X_scaled, y)
# predict returns one label per query row; [0] takes the only one.
pred = sk_knn.predict(X_new_scaled)[0]
# kneighbors gives the neighbor distances first, then the neighbor indices.
distances, kneighbors = sk_knn.kneighbors(X_new_scaled)

In [126]:
# Print the scikit-learn results so they can be compared with pred1 above;
# flatten() turns the one-row 2d arrays into flat lists for display.
print(f'Predicted class: {pred}')
print(f'Nearest neighbors: {kneighbors.flatten()}')
print(f'Neighbor distances: {distances.flatten()}')

Predicted class: 1
Nearest neighbors: [ 83 226 275 200  60]
Neighbor distances: [1.24555067 1.2904598  1.49994867 1.65254366 1.66789103]
