## K-Means Clustering
#### Lew Sears

In [1]:
import numpy as np
import pandas as pd

In [91]:
class K_Means_Clustering:
    '''Basic k-means (Lloyd's algorithm) for cluster analysis.

    Attributes:
        k: Number of clusters, fixed at construction.
        centroids_: Array of shape (k, n_features) holding the final
            centroids; set by ``fit``.
        labels_: Array of shape (n_samples,) holding the final cluster
            index of every row; set by ``fit``.
    '''

    def __init__(self, k):
        '''Store the hyperparameter k.

        Args:
            k: Number of clusters; a positive integer (Python or numpy).

        Raises:
            ValueError: If k is not an integer >= 1. Raising (instead of
                printing) prevents a half-built object without ``self.k``.
        '''
        # bool is a subclass of int, but True/False is never a sensible k.
        is_integer = isinstance(k, (int, np.integer)) and not isinstance(k, bool)
        if not is_integer or k < 1:
            raise ValueError(
                "Given k = {}. Bad choice my friend! "
                "k must be a nonzero positive integer.".format(k)
            )
        self.k = int(k)

    def fit(self, df, iterations):
        '''Cluster the rows of ``df`` and return one label per row.

        Centroids are initialised from a standard normal distribution, so
        the data should be scaled (e.g. with StandardScaler) beforehand.

        Args:
            df: 2-D numeric data of shape (n_samples, n_features). Anything
                ``np.asarray`` can convert works: np.array, list of rows,
                or an all-numeric pandas DataFrame.
            iterations: Maximum number of assign/update rounds. The loop
                stops earlier once the labels no longer change.

        Returns:
            np.ndarray of shape (n_samples,) with labels in ``range(k)``.

        Raises:
            ValueError: If ``df`` is not 2-D or ``iterations`` is not positive.
        '''
        data = np.asarray(df, dtype=float)
        if data.ndim != 2:
            raise ValueError(
                "df must be 2-dimensional, got {} dimension(s)".format(data.ndim)
            )
        if iterations <= 0:
            raise ValueError("iterations must be positive, got {}".format(iterations))

        n_features = data.shape[1]
        # df is scaled, so start from random points normally distributed around 0.
        centroids = np.random.normal(0, 1, size=(self.k, n_features))

        previous_labels = None
        reseeded = False
        completed = 0
        while completed < iterations:
            labels = self._assign_labels(data, centroids)

            # Converged: same assignment as last round and every centroid
            # is a true cluster mean (no random re-seed still to be tried).
            if (previous_labels is not None and not reseeded
                    and np.array_equal(labels, previous_labels)):
                break

            centroids, reseeded = self._update_centroids(data, labels, n_features)
            previous_labels = labels
            completed += 1

        self.centroids_ = centroids
        self.labels_ = labels
        return labels

    @staticmethod
    def _assign_labels(data, centroids):
        '''Return, for every row of data, the index of its nearest centroid.'''
        # Broadcast to (n_samples, k, n_features), then reduce the feature axis
        # to get squared Euclidean distances of shape (n_samples, k).
        offsets = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.argmin(np.sum(offsets ** 2, axis=2), axis=1)

    def _update_centroids(self, data, labels, n_features):
        '''Return (new centroids, whether any empty cluster was re-seeded).'''
        centroids = np.empty((self.k, n_features))
        reseeded = False
        # Walk over all k clusters, not just the labels that occur, so an
        # empty cluster is re-seeded rather than silently dropped.
        for cluster in range(self.k):
            members = data[labels == cluster]
            if members.shape[0] == 0:
                centroids[cluster] = np.random.normal(0, 1, size=n_features)
                reseeded = True
            else:
                centroids[cluster] = members.mean(axis=0)
        return centroids, reseeded

In [92]:
# Toy input: 100 rows of 5 features drawn from a standard normal distribution.
sample_df = np.random.normal(0 , 1, size = (100,5))
# Preview the first five rows.
sample_df[:5]

array([[-0.36177393,  0.37052257, -0.57885455, -0.39851214, -0.28042686],
       [ 0.36549238, -1.4428312 ,  0.24433438, -0.74179752,  2.1499628 ],
       [-1.36458607,  1.98729012,  0.63874237, -0.561706  , -0.18450669],
       [ 0.15971166,  0.5011711 ,  0.7094652 , -1.3826558 ,  0.04625263],
       [ 0.54021006, -0.4239882 , -0.77850737, -1.66727699,  0.16915566]])

In [93]:
# Build a clusterer with k = 5 clusters.
model = K_Means_Clustering(5)

In [97]:
# Cluster the sample data, requesting 100 iterations; `a` holds one label per row.
a = model.fit(sample_df, 100)

In [98]:
# Show the distinct labels and how many rows were assigned to each.
np.unique(a, return_counts = True)

(array([0, 1, 2, 3, 4]), array([19, 23, 24, 17, 17]))

In [99]:
# Display the label assigned to every row.
a

array([2, 1, 3, 2, 0, 1, 0, 3, 4, 1, 4, 0, 1, 2, 1, 2, 3, 4, 2, 0, 2, 0,
       3, 1, 4, 0, 2, 4, 2, 3, 2, 2, 2, 3, 4, 2, 2, 1, 0, 0, 2, 1, 0, 0,
       3, 1, 0, 1, 0, 4, 4, 3, 0, 0, 2, 3, 3, 0, 0, 1, 2, 1, 3, 1, 1, 4,
       2, 4, 2, 1, 0, 1, 3, 4, 2, 4, 4, 3, 1, 1, 1, 1, 2, 3, 4, 1, 0, 4,
       2, 2, 1, 0, 3, 2, 4, 3, 4, 2, 1, 3])