In [26]:
import numpy as np
import numpy.linalg as la
from matplotlib import pyplot as plt
import copy

In [3]:
# Toy data set: 4 samples (rows), each with 3 numeric features (columns).
data = np.array([
    [1, 2, 3],
    [3, 4, 5],
    [5, 6, 4],
    [4, 3, 2],
])

In [30]:
# Column-wise mean: axis=0 collapses the rows, leaving one value per column.
data.mean(axis=0)

array([3.25, 3.75, 3.5 ])

In [42]:
k = 2
# np.random.shuffle reorders the rows of `data` in place (it returns None),
# so every cell run after this one sees the shuffled row order.
np.random.shuffle(data)
# The first k rows of the shuffled array, i.e. k rows picked at random.
data[:k]

array([[5, 6, 4],
       [3, 4, 5]])

In [23]:
class KMeansClassifier:
    """Given some data, determines k means to be used for classification.

    Call train() to fit the means, then test() to look up the mean that is
    closest to a new point.
    """

    def __init__(self):
        # Cluster centres found by train(): an empty list until training has
        # run, afterwards a float array of shape (k, n_features).
        self.means = []

    def train(self, data, k, max_iter=1000):
        """Run the k-means algorithm, stores the final means in self.means.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)
            Numeric observations, one per row. The input is never modified.
        k : int
            Number of means to fit; must satisfy 1 <= k <= n_samples.
        max_iter : int, optional
            Upper bound on the number of refinement passes. The loop normally
            stops earlier, as soon as the means stop changing.

        Raises
        ------
        ValueError
            If data is not two-dimensional or k is out of range.
        """
        # Work on a private float copy: the shuffle below is destructive, and
        # float storage keeps fractional means from being truncated when the
        # input holds integers.
        points = np.array(data, dtype=float)
        if points.ndim != 2:
            raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
        if not 1 <= k <= len(points):
            raise ValueError("k must be between 1 and the number of rows in data")

        # 1. choose k points at random from data: shuffle the copy and take
        #    its first k rows (copied so the means never alias `points`).
        np.random.shuffle(points)
        means = points[:k].copy()

        # 2. loop the following until the means stop changing
        for _ in range(max_iter):
            # distances[r, j] is the euclidean distance from row r to mean j
            distances = la.norm(points[:, np.newaxis, :] - means[np.newaxis, :, :], axis=2)
            # index of the closest mean for every row
            closest = np.argmin(distances, axis=1)

            # re-assign each mean to the average of the rows assigned to it
            updated = means.copy()
            for i in range(k):
                group = points[closest == i]
                # A mean that attracted no rows keeps its previous position;
                # averaging an empty group would produce NaN.
                if len(group) > 0:
                    updated[i] = group.mean(axis=0)

            converged = np.array_equal(updated, means)
            means = updated
            if converged:
                break

        self.means = means

    def test(self, x):
        """Return the mean closest to x among self.means.

        Parameters
        ----------
        x : array-like of shape (n_features,)
            The point to classify.

        Returns
        -------
        numpy.ndarray
            A copy of the row of self.means nearest to x in euclidean distance.

        Raises
        ------
        ValueError
            If no means are available because train() has not been run.
        """
        means = np.asarray(self.means, dtype=float)
        if means.size == 0:
            raise ValueError("no means available; call train() first")

        point = np.asarray(x, dtype=float)
        nearest = np.argmin(la.norm(means - point, axis=1))
        # Return a copy so callers cannot alter the fitted means by accident.
        return means[nearest].copy()
    

In [None]:
# when testing on the iris data, for instance, we know beforehand
# (in a practical setting, perhaps by consulting with a botanist)
# that there should be 3 groups, so set k = 3 when training


In [43]:
# Display `data`; its row order reflects the in-place np.random.shuffle run earlier.
data

array([[5, 6, 4],
       [3, 4, 5],
       [1, 2, 3],
       [4, 3, 2]])

In [53]:
# np.vstack builds and returns a new array with the extra row appended at the
# bottom; `data` itself is not modified.
np.vstack((data, np.array([[1, 1, 1]])))

array([[5, 6, 4],
       [3, 4, 5],
       [1, 2, 3],
       [4, 3, 2],
       [1, 1, 1]])

In [54]:
# `data` still has its original four rows: np.vstack returned a new array
# rather than appending to `data`.
data

array([[5, 6, 4],
       [3, 4, 5],
       [1, 2, 3],
       [4, 3, 2]])

In [55]:
# Iterating over a 2-D array yields one row (a 1-D array) per step.
for row in data:
    print(row)

[5 6 4]
[3 4 5]
[1 2 3]
[4 3 2]
