# Homework 2

In [1]:
import os
import numpy as np
import math
import statistics

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from src.load_json_data import load_json_data

In [8]:
blobs = load_json_data(os.path.join('data', 'blobs.json'))

In [2]:
import numpy as np 
import math

def euclidean_distance(v, w):
    """Returns the Euclidean distance between two vectors"""

    return math.sqrt(sum([(v[ii]-w[ii])**2 for ii in range(len(v))]))

def manhattan_distance(v, w):
    """Returns the Manhattan distance between two vectors"""

    return sum([abs(v[ii]-w[ii]) for ii in range(len(v))])

def euclidean_distances(X, Y):
    """Compute pairwise Euclidean distance between the rows of two matrices X (shape MxK) 
    and Y (shape NxK). The output of this function is a matrix of shape MxN containing
    the Euclidean distance between two rows.
    
    Arguments:
        X {np.ndarray} -- First matrix, containing M examples with K features each.
        Y {np.ndarray} -- Second matrix, containing N examples with K features each.

    Returns:
        D {np.ndarray}: MxN matrix with Euclidean distances between rows of X and rows of Y.
    """
    
    result = np.ndarray((X.shape[0], Y.shape[0]))
    
    for ii in range(X.shape[0]):
        for kk in range(Y.shape[0]):
            
            result[ii,kk] = euclidean_distance(X[ii], Y[kk])
    
    return result


def manhattan_distances(X, Y):
    """Compute pairwise Manhattan distance between the rows of two matrices X (shape MxK) 
    and Y (shape NxK). The output of this function is a matrix of shape MxN containing
    the Manhattan distance between two rows.
    
    Arguments:
        X {np.ndarray} -- First matrix, containing M examples with K features each.
        Y {np.ndarray} -- Second matrix, containing N examples with K features each.

    Returns:
        D {np.ndarray}: MxN matrix with Manhattan distances between rows of X and rows of Y.
    """
    result = np.ndarray((X.shape[0], Y.shape[0]))
    
    for ii in range(X.shape[0]):
        for kk in range(Y.shape[0]):
            
            result[ii,kk] = manhattan_distance(X[ii], Y[kk])
    
    return result



# KNN


In [29]:
X_blobs, y_blobs = load_json_data(os.path.join('data', 'blobs.json'))

In [177]:
import numpy as np 

class KNearestNeighbor():    
    def __init__(self, n_neighbors, distance_measure='euclidean', aggregator='mode'):
        """
        K-Nearest Neighbor is a straightforward algorithm that can be highly
        effective. Training time is...well...is there any training? At test time, labels for
        new points are predicted by comparing them to the nearest neighbors in the
        training data.

        ```distance_measure``` lets you switch between which distance measure you will
        use to compare data points. The behavior is as follows:

        If 'euclidean', use euclidean_distances, if 'manhattan', use manhattan_distances.

        ```aggregator``` lets you alter how a label is predicted for a data point based 
        on its neighbors. If it's set to `mean`, it is the mean of the labels of the
        neighbors. If it's set to `mode`, it is the mode of the labels of the neighbors.
        If it is set to median, it is the median of the labels of the neighbors. If the
        number of dimensions returned in the label is more than 1, the aggregator is
        applied to each dimension independently. For example, if the labels of 3 
        closest neighbors are:
            [
                [1, 2, 3], 
                [2, 3, 4], 
                [3, 4, 5]
            ] 
        And the aggregator is 'mean', applied along each dimension, this will return for 
        that point:
            [
                [2, 3, 4]
            ]

        Arguments:
            n_neighbors {int} -- Number of neighbors to use for prediction.
            distance_measure {str} -- Which distance measure to use. Can be one of
                'euclidean' or 'manhattan'. This is the distance measure
                that will be used to compare features to produce labels. 
            aggregator {str} -- How to aggregate a label across the `n_neighbors` nearest
                neighbors. Can be one of 'mode', 'mean', or 'median'.
        """
        self.n_neighbors = n_neighbors
        self.distance_measure = distance_measure
        self.aggregator = aggregator



    def fit(self, features, targets):
        """Fit features, a numpy array of size (n_samples, n_features). For a KNN, this
        function should store the features and corresponding targets in class 
        variables that can be accessed in the `predict` function. Note that targets can
        be multidimensional! 
        
        Arguments:
            features {np.ndarray} -- Features of each data point, shape of (n_samples,
                n_features).
            targets {[type]} -- Target labels for each data point, shape of (n_samples, 
                n_dimensions).
        """
        self.features = features
        self.targets = targets
        

    def predict(self, features, ignore_first=False):
        """Predict from features, a numpy array of size (n_samples, n_features) Use the
        training data to predict labels on the test features. For each testing sample, compare it
        to the training samples. Look at the self.n_neighbors closest samples to the 
        test sample by comparing their feature vectors. The label for the test sample
        is the determined by aggregating the K nearest neighbors in the training data.

        Note that when using KNN for imputation, the predicted labels are the imputed testing data
        and the shape is (n_samples, n_features).

        Arguments:
            features {np.ndarray} -- Features of each data point, shape of (n_samples,
                n_features).
            ignore_first {bool} -- If this is True, then we ignore the closest point
                when doing the aggregation. This is used for collaborative
                filtering, where the closest point is itself and thus is not a neighbor. 
                In this case, we would use 1:(n_neighbors + 1).

        Returns:
            labels {np.ndarray} -- Labels for each data point, of shape (n_samples,
                n_dimensions). This n_dimensions should be the same as n_dimensions of targets in fit function.
        """
        print(features)
        distance_types = dict()
        distance_types['euclidean'] = euclidean_distances
        distance_types['manhattan'] = manhattan_distances
        
        # A single row are all the distances from a vector in the training data to each vector in the new data
        # A single column are all the distances from a vector in the new data to a vector in the training data
        distances = distance_types[self.distance_measure](self.features, features)
        print(self.features)
        print(distances)
#         labels = np.ndarray((features.shape[0], self.targets.shape[1]))
        labels = list()
        
        for ii in range(features.shape[0]):
            
            # Take all the distances from the ii-th vector in the new data to each of the vectors in the training data
            # enumerate them in tuples (in order to memorize their position in the data and use it to retrieve the appropriate labels from self.targets )
            # and sort them in ascending order by distance
            # take only the self.n_neighbors top tuples (corresponding to the nearest neighbours)
            lowest_distances = sorted(list(enumerate(distances[:,ii])), key = lambda x: x[1])[ignore_first:self.n_neighbors+ignore_first]
            lowest_indices = [distance_tuple[0] for distance_tuple in lowest_distances ]
            
            lowest_labels = self.targets[lowest_indices,]
            
            print("LOWEST_DISTANCES")
            print(lowest_distances)
            print("LOWEST INDICES")
            print(lowest_indices)
            print("TARGETS")
            print(self.targets)
            print("LOWEST LABELS ")
            print(lowest_labels)
            
            if self.aggregator == 'median':
                this_vector_labels = np.median(lowest_labels, axis=0)
                
            if self.aggregator == 'mean':
                this_vector_labels = np.mean(lowest_labels, axis=0)
                
            if self.aggregator == 'mode':
                print("SELF.TARGETS.SHAPE: ", self.targets.reshape(-1,1).shape)
                print("SELF.TARGETS.SHAPE[1]: ", self.targets.reshape(-1,1).shape[1])
#                 this_vector_labels = [statistics.mode(lowest_labels[:, kk]) for kk in range(self.targets.reshape(-1,1).shape[0])]
                if lowest_labels.ndim != 1:
                    this_vector_labels = list()
                    for kk in range(lowest_labels.shape[1]):
                        this_vector_labels.append(statistics.mode([row[kk] for row in lowest_labels]))
                else:
                    this_vector_labels = statistics.mode(lowest_labels)
    

            print(f'PREDICTION for {ii}-th vector is: {this_vector_labels}')    
            print(this_vector_labels)
            labels.append(this_vector_labels)
        
        return np.array(labels)


In [178]:
X_train = np.array([[1,2,3], [4,5,6], [7,8,9], [10,11,12]])
y_train = np.array([[5,8],[5,10], [7,10], [6,9]])

X_test = np.array([[3,6,8], [3,5,6]])

In [179]:
knn = KNearestNeighbor(3, distance_measure='manhattan' )# ,aggregator='median'
knn.fit(X_train, y_train)

In [180]:
# np.array([[10, 15],
#         [13,17]]).reshape(1,-1).flatten()

In [181]:
knn.predict(X_test) #ignore_first=True

[[3 6 8]
 [3 5 6]]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[11.  8.]
 [ 4.  1.]
 [ 7. 10.]
 [16. 19.]]
LOWEST_DISTANCES
[(1, 4.0), (2, 7.0), (0, 11.0)]
LOWEST INDICES
[1, 2, 0]
TARGETS
[[ 5  8]
 [ 5 10]
 [ 7 10]
 [ 6  9]]
LOWEST LABELS 
[[ 5 10]
 [ 7 10]
 [ 5  8]]
SELF.TARGETS.SHAPE:  (8, 1)
SELF.TARGETS.SHAPE[1]:  1
PREDICTION for 0-th vector is: [5, 10]
[5, 10]
LOWEST_DISTANCES
[(1, 1.0), (0, 8.0), (2, 10.0)]
LOWEST INDICES
[1, 0, 2]
TARGETS
[[ 5  8]
 [ 5 10]
 [ 7 10]
 [ 6  9]]
LOWEST LABELS 
[[ 5 10]
 [ 5  8]
 [ 7 10]]
SELF.TARGETS.SHAPE:  (8, 1)
SELF.TARGETS.SHAPE[1]:  1
PREDICTION for 1-th vector is: [5, 10]
[5, 10]


array([[ 5, 10],
       [ 5, 10]])

In [108]:
import statistics
statistics.mode([1,1,2])

1

In [163]:
arr = np.array([[15 ,20],
 [7 ,13], [7 ,13]])
arr

array([[15, 20],
       [ 7, 13],
       [ 7, 13]])

In [164]:
for row in arr:
    print(row)

[15 20]
[ 7 13]
[ 7 13]


In [165]:
arr.shape

(3, 2)

In [171]:
np.array(np.array([15 ,20]))

array([15, 20])

In [167]:
if arr.ndim != 1:
    this_vector_labels = list()
    for kk in range(arr.shape[1]):
        this_vector_labels.append(statistics.mode([row[kk] for row in arr]))
        
this_vector_labels

[7, 13]