# K Nearest Neighbors

In [None]:
# Move up one directory so the relative `lib/` paths used by later cells
# resolve (presumably the project root — confirm).
cd ..

In [None]:
from sklearn.datasets import make_blobs
from sklearn.datasets import load_iris
%run lib/imports.py
%matplotlib inline

### Using `make_blobs` to create an artificial data set

In [None]:
# Synthetic classification data: 10,000 samples with 10 features, drawn
# around 25 cluster centers with a per-cluster standard deviation of 4.
blob = make_blobs(
    n_samples=10000,
    n_features=10,
    centers=25,
    cluster_std=4,
)

In [None]:
# make_blobs returns a (samples, labels) pair; split it into named arrays.
data, target = blob

In [None]:

# Scatter two of the ten feature columns (column 1 on the x axis, column 0 on
# the y axis), coloring each point by its blob label.
plt.scatter(data[:,1], data[:,0], c=target)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the samples for evaluation; the other half is used to fit.
blob_X_tr, blob_X_ts, blob_y_tr, blob_y_ts = train_test_split(
    data,
    target,
    test_size=0.5,
)

In [None]:
# scikit-learn classifier configured with a single neighbor.
# NOTE(review): in the visible notebook `knc` is rebound with n_neighbors=5 in
# the Score section before this instance is ever fitted or scored.
knc = KNeighborsClassifier(n_neighbors=1)

In [None]:
from lib.models.KNN import KNN

## Score

In [None]:
# Reference result: scikit-learn's KNeighborsClassifier with 5 neighbors,
# fitted on the training split and scored on the held-out split.
knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(blob_X_tr, blob_y_tr)
knc.score(blob_X_ts, blob_y_ts)

In [None]:
# Same experiment with the project's KNN class (imported from lib.models.KNN),
# using 5 neighbors so the score is comparable with the scikit-learn cell.
mod = KNN(neighbors=5)
mod.fit(blob_X_tr, blob_y_tr)
mod.score(blob_X_ts, blob_y_ts)

In [None]:
# Same experiment with the KNN_FAST class, again with 5 neighbors.
# NOTE(review): KNN_FAST is defined in the last cell of this notebook, so that
# cell has to be executed before this one when running top to bottom.
model_t = KNN_FAST(neighbors=5)
model_t.fit(blob_X_tr, blob_y_tr)
model_t.score(blob_X_ts, blob_y_ts)

# Performance

In [None]:
%%timeit
# Times one full construct / fit / score cycle of scikit-learn's classifier
# with a single neighbor.
knc = KNeighborsClassifier(n_neighbors=1)
knc.fit(blob_X_tr, blob_y_tr)
knc.score(blob_X_ts, blob_y_ts)

In [None]:
%%timeit
# Times one full construct / fit / score cycle of the project's KNN class with
# its default arguments (defaults are not visible here — presumably one
# neighbor to match the other timing cells; confirm in lib/models/KNN).
mod = KNN()
mod.fit(blob_X_tr, blob_y_tr)
mod.score(blob_X_ts, blob_y_ts)

In [None]:
%%timeit
# Times one full construct / fit / score cycle of KNN_FAST with its default
# arguments (a single neighbor).
model_FAST = KNN_FAST()
model_FAST.fit(blob_X_tr, blob_y_tr)
model_FAST.score(blob_X_ts, blob_y_ts)

In [None]:

class KNN_FAST():
    '''K-Nearest-Neighbors classifier using a brute-force distance search.

    Parameters
    -----------
    neighbors : int, default 1
        Number of nearest training samples that vote on each prediction.
    distance_metric : str, default 'minkowski'
        'manhattan' (p = 1), 'euclidean' (p = 2) or 'minkowski' (uses ``p``).
        Any other value falls back to the Minkowski distance with ``p``.
    p : int or float, default 2
        Exponent of the Minkowski distance.

    Attributes
    -----------
    X_fit, y_fit : np.ndarray
        Training samples and labels stored by ``fit``.
    '''

    def __init__(self, neighbors=1, distance_metric='minkowski', p=2):
        self.n = neighbors
        self.metric = distance_metric
        self.p = p

    def fit(self, X, y):
        '''Store the training samples ``X`` and their labels ``y``.

        Nothing is precomputed; all distance work happens at prediction time.
        '''
        self.X_fit, self.y_fit = self.data_check(X, y)

    def _effective_p(self):
        '''Return the Minkowski exponent implied by the selected metric.'''
        if self.metric == 'manhattan':
            return 1
        if self.metric == 'euclidean':
            return 2
        return self.p

    def find_distance(self, X):
        '''Return the distances between every row of ``X`` and every
        training sample.

        The result has shape (len(X), number of training samples). The
        distance is the Minkowski distance whose exponent is chosen by
        ``distance_metric`` / ``p``; the defaults give the Euclidean distance.
        '''
        X = np.asarray(X)
        p = self._effective_p()

        distance = np.empty((X.shape[0], self.X_fit.shape[0]))
        # One query row at a time: the only temporary is a single
        # training-set-sized difference array rather than a 3-D tensor.
        for i, x in enumerate(X):
            diff = self.X_fit - x
            if p == 2:
                distance[i, :] = np.sqrt(np.square(diff).sum(axis=1))
            elif p == 1:
                distance[i, :] = np.abs(diff).sum(axis=1)
            else:
                distance[i, :] = (np.abs(diff) ** p).sum(axis=1) ** (1.0 / p)
        return distance

    def prediction(self, X, y=None):
        '''Predict a label for every row of ``X`` by majority vote among the
        ``neighbors`` closest training samples.

        ``y`` is accepted only for backward compatibility and is ignored.
        Returns a float array holding one predicted label per row of ``X``.
        '''
        dist = self.find_distance(X)
        # Indices of the training samples, closest first, cut to k columns.
        nbr_ind = dist.argsort()[:, 0:self.n]
        nbr_trg = self.y_fit[nbr_ind]
        pred = np.empty(len(nbr_trg))

        for i, trg in enumerate(nbr_trg):
            ID, count = np.unique(trg, return_counts=True)
            # The label with the highest vote count wins.
            pred[i] = ID[count.argsort()[-1]]

        return pred

    def score(self, X, y):
        '''Return the fraction of rows of ``X`` whose predicted label
        equals the corresponding entry of ``y``.

        Raises
        ------
        ValueError
            If ``X`` contains no samples.
        '''
        y = np.asarray(y)
        pred = self.prediction(X)
        if pred.size == 0:
            raise ValueError('Cannot compute a score for an empty test set.')

        # The mean of the boolean matches is the accuracy. Unlike indexing a
        # bincount, it is also defined when every prediction is wrong.
        return np.mean(pred == y)

    def data_check(self, X, y):
        '''
        Converts data to a numpy ndarray.

        Returns the pair ``(X, y)`` as ndarrays; inputs that already are
        plain ndarrays are returned as-is.

        Raises
        ------
        ValueError
            If either input cannot be converted.
        '''
        try:
            X = np.asarray(X)
        except Exception as err:
            raise ValueError('Failed to convert data to np.ndarray') from err

        try:
            y = np.asarray(y)
        except Exception as err:
            raise ValueError('Failed to convert target to np.ndarray') from err

        return X, y

    def metric_check(self):
        '''Reconcile ``p`` with the selected metric.

        For 'manhattan' and 'euclidean' a mismatching ``p`` is overwritten
        with 1 or 2 respectively, and a warning is issued. For 'minkowski'
        with ``p == 2`` a warning points out the equivalence with the
        Euclidean metric. Nothing is raised: the messages are advisory.
        '''
        import warnings

        if self.metric == 'manhattan':
            if self.p != 1:
                warnings.warn(
                    'Manhattan metric has been selected, p value will be set to 1.',
                    stacklevel=2,
                )
                self.p = 1
        elif self.metric == 'euclidean':
            if self.p != 2:
                warnings.warn(
                    'Euclidean metric has been selected, p value will be set to 2.',
                    stacklevel=2,
                )
                self.p = 2
        elif self.metric == 'minkowski':
            if self.p == 2:
                warnings.warn(
                    'A p value of 2 has been selected with the Minkowski metric. '
                    'This is equivalent to the Euclidean metric.',
                    stacklevel=2,
                )