In [1]:
# import dependencies
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [260]:
# simulate data
K,D = 3,2
N = int(K*1e3)

X0 = np.random.randn(N//K,2) + np.array([2,2])
X1 = np.random.randn(N//K,2) + np.array([2,-2])
X2 = np.random.randn(N//K,2) + np.array([-2,0])
X = np.vstack(( X0, X1, X2 ))

y = np.array([0]*(N//K) + [1]*(N//K) + [2]*(N//K) )

In [43]:
# KNN class definition
class KNN:
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def predict(self, X, K, **kwargs):
        
        # decide on classification or regression
        classification = kwargs['classification'] if 'classification' in kwargs else True

        distance = self.__select_distance_metric(**kwargs)
            
        N,D = X.shape
        yhat = np.zeros(N)
        
        if classification:
            X = X.reshape([N,1,D])
            DIST = distance(X)
            idx = DIST[:,1].argsort()
            DIST = DIST[ : , idx ]
            GAMMA = self.__gamma(DIST)
            yhat = stats.mode(self.y[idx])[0]
#             yhat = np.bincount(self.y[idx], weights=GAMMA).argmax(axis=1)
        
        return yhat
            
            
            
            
        
#         if classification:
#             for i in range(N):
#                 dist = np.sum( (self.X - X[i])**2, axis=1 )
#                 idx = dist.argsort()[:K]
#                 d = dist[idx]
#                 gamma = self.__gamma(d)
#                 yhat[i] = np.bincount(self.y[idx], weights=gamma).argmax()
#             return yhat
#         else:
#             for i in range(N):
#                 dist = np.sum( (self.X - X[i])**2, axis=1 )
#                 idx = dist.argsort()[:K]
#                 d = dist[idx]
#                 gamma = self.__gamma(d)
#                 yhat[i] = self.y[idx].dot(gamma)
#             return yhat
    
    #==================#
    # distance metrics #
    #==================#
    
    def __distance_Euclidean(self, X):
        return np.sqrt( np.sum( (self.X - X)**2, axis=2) )
    
    def __distance_Euclidean_squared(self, X):
        return np.sum( (self.X - X)**2, axis=2)
    
    def __distance_Manhattan(self, X):
        return np.sum( np.abs(self.X - X), axis=2 )
    
    def __select_distance_metric(self, **kwargs):
        
        metrics = {
            'euclidean': self.__distance_Euclidean,
            'euclidean_squared': self.__distance_Euclidean_squared,
            'manhattan': self.__distance_Manhattan
        }
        
        distance = kwargs['metric'] if 'metric' in kwargs else 'euclidean_squared'
        distance = distance.lower()
        distance = metrics[distance]
        
        return distance
    
    #==================#
    # weight functions #
    #==================#
    
    def __gamma(self, d, **kwargs):
        ed = -np.exp(d)
        return ed / np.sum(ed, axis=1, keepdims=True)

In [44]:
# instantiate KNN class to test
knn = KNN(X,y)

In [45]:
# fit classification
t_0 = datetime.now()
yhat = knn.predict(X,9)
t_f = datetime.now()

np.mean(y == yhat)
df = (t_f - t_0).seconds
print(f"accuracy: {np.mean(y==yhat)}, time: {(t_f-t_0)}")

old = "accuracy: 0.9693333333333334, time: 0:00:00.779266"

ValueError: object too deep for desired array

In [None]:
# fit regression
t_0 = datetime.now()
yhat = knn.predict(X,9, classification=False)
t_f = datetime.now()

np.mean(y == yhat)
df = (t_f - t_0).seconds
print(f"accuracy: {np.mean(y==yhat)}, time: {(t_f-t_0)}")

In [261]:
t_0 = datetime.now()
y_hat = np.zeros(N)

X1 = X.copy().reshape([N,1,D])
DIST = np.sum( (X - X1)**2, axis=2)
idx = np.argpartition(DIST, K)[:K]
votes = y[idx]
# votes,votes.shape
for i in range(votes.shape[1]):
    y_hat[i] = np.argmax( np.bincount(votes[:,i]) )
# DIST = DIST[idx] # dimentionality problem?
# eD = np.exp(DIST)
# GAMMA = eD / eD.sum(axis=1, keepdims=True)
# yhat = stats.mode(y*GAMMA, axis=0)[1]

t_f = datetime.now()
print(f"accuracy: {np.mean(y==yhat)}, time: {(t_f-t_0)}")
# yhat = np.bincount(self.y[idx], weights=GAMMA).argmax(axis=1)

accuracy: 0.3333333333333333, time: 0:00:00.438443


In [111]:
np.mean(y ==yhat)

0.3333333333333333

In [191]:
A1 = (np.random.randn(5,2) * 100) // 10
A1,A1.shape # obs,feature

(array([[ 18.,  -3.],
        [-11.,   6.],
        [  9.,  -3.],
        [  4.,   7.],
        [ -1.,   7.]]), (5, 2))

In [192]:
A2 = (np.random.randn(5,1,2) * 100) // 10
A2,A2.shape #obs,feature

(array([[[  9.,   6.]],
 
        [[-15.,   6.]],
 
        [[  0.,  -7.]],
 
        [[ -2.,  -3.]],
 
        [[  0.,  -6.]]]), (5, 1, 2))

In [193]:
A3 = A1-A2
A3,A3.shape # pred,obs,feature

(array([[[  9.,  -9.],
         [-20.,   0.],
         [  0.,  -9.],
         [ -5.,   1.],
         [-10.,   1.]],
 
        [[ 33.,  -9.],
         [  4.,   0.],
         [ 24.,  -9.],
         [ 19.,   1.],
         [ 14.,   1.]],
 
        [[ 18.,   4.],
         [-11.,  13.],
         [  9.,   4.],
         [  4.,  14.],
         [ -1.,  14.]],
 
        [[ 20.,   0.],
         [ -9.,   9.],
         [ 11.,   0.],
         [  6.,  10.],
         [  1.,  10.]],
 
        [[ 18.,   3.],
         [-11.,  12.],
         [  9.,   3.],
         [  4.,  13.],
         [ -1.,  13.]]]), (5, 5, 2))

In [194]:
A = np.sum( A3**2, axis=2 )
A,A.shape # each row is the distances from the point being predicted to the points in the training data

(array([[ 162.,  400.,   81.,   26.,  101.],
        [1170.,   16.,  657.,  362.,  197.],
        [ 340.,  290.,   97.,  212.,  197.],
        [ 400.,  162.,  121.,  136.,  101.],
        [ 333.,  265.,   90.,  185.,  170.]]), (5, 5))

In [245]:
K = 2
idx = np.argpartition(A, K)[:K]
idx,idx.shape

(array([[3, 2, 4, 0, 1],
        [1, 4, 3, 2, 0]]), (2, 5))

In [247]:
votes = y[idx]
votes,votes.shape

(array([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), (2, 5))

In [248]:
labels_count = np.bincount(votes)
labels_count,labels_count.shape

ValueError: object too deep for desired array