In [5]:
import random
import math
import time
import csv
from sklearn.cluster import KMeans
import numpy as np

In [50]:
"""kmeans takes in a list of lists, which represents a list of data
points, an integer indicating the number of clusters, a float threshold
for when to stop running the algorithm, and an optional integer giving the
maximum number of iterations. It returns the list of cluster centers, the
list of cluster memberships (one cluster index per data point), and the
list of data-point counts per cluster."""
def kmeans2(observations, numclusters, threshold, maxiteration=300):
    """Cluster observations into numclusters groups with k-means.

    Each pass assigns every observation to its nearest center (by squared
    distance, via distance()) and then recomputes the centers (via
    updateclustercenter()).

    Parameters
    ----------
    observations : list of lists of numbers
        The data points; every point is expected to have the same length
        as the first one. The input is never modified.
    numclusters : int
        Number of clusters; must satisfy 1 <= numclusters <= len(observations).
    threshold : float
        Iteration continues while the fraction of points that changed
        cluster during the previous pass is greater than this value.
    maxiteration : int, optional
        Upper bound on the number of passes (default 300).

    Returns
    -------
    (centers, membership, numdataperclust)
        centers is a list of numclusters center coordinates, membership
        holds one cluster index per observation (-1 if no pass ran), and
        numdataperclust holds the number of observations assigned to each
        cluster during the last pass.

    Raises
    ------
    ValueError
        If observations is empty or numclusters is out of range.
    """
    bign = len(observations)
    if bign == 0:
        raise ValueError("observations must not be empty")
    if numclusters < 1 or numclusters > bign:
        raise ValueError("numclusters must be between 1 and len(observations)")
    ndim = len(observations[0])

    # Seed the centers with data points taken at regular intervals. Each seed
    # is copied so that later center updates can never write through into the
    # caller's data, and exactly numclusters seeds are taken.
    stride = bign // numclusters
    centers = [list(observations[k * stride]) for k in range(numclusters)]

    # -1 marks "not assigned yet", so every point counts as a change on the
    # first pass.
    membership = [-1] * bign
    numdataperclust = [0] * numclusters
    centertotal = [[0.0] * ndim for _ in range(numclusters)]

    # Start with "everything changed" so the loop body runs at least once
    # (unless threshold >= 1 or maxiteration <= 0).
    numchanges = bign
    nloops = 0

    while 1.0 * numchanges / bign > threshold and nloops < maxiteration:
        # Reset the per-cluster accumulators; floats keep the later averaging
        # from truncating on integer data.
        for i in range(numclusters):
            numdataperclust[i] = 0
            for j in range(ndim):
                centertotal[i][j] = 0.0
        nloops += 1
        numchanges = 0

        for i in range(bign):
            point = observations[i]

            # Find the nearest center, starting with center 0 as the candidate.
            mindist = distance(point, centers[0])
            member = 0
            for j in range(1, numclusters):
                newdist = distance(point, centers[j])
                if newdist < mindist:
                    mindist = newdist
                    member = j

            # Accumulate this point into the running sum of its cluster.
            numdataperclust[member] += 1
            for j in range(ndim):
                centertotal[member][j] += point[j]

            if member != membership[i]:
                numchanges += 1
                membership[i] = member

        # Recompute the centers from the sums gathered during this pass.
        updateclustercenter(observations, centertotal, numdataperclust, numclusters, centers)

    return centers, membership, numdataperclust

In [43]:
"""updateclustercenter takes a list of list of lists, data, which is a 
list that contains lists, each of which represents one cluster, and in
those sub-lists are lists representing data. updateclustercenter also
takes the dimensions of the data as an input, an integer. Then it returns
a new list of centers that are the average of the data in each cluster."""
def updateclustercenter(observations, centertotal, numdataperclust, numclusters, centers):
    """Recompute every cluster center in place.

    Parameters
    ----------
    observations : list of lists of numbers
        The data points; only read, never modified.
    centertotal : list of lists of numbers
        Per-cluster, per-dimension sums of the assigned observations.
    numdataperclust : list of int
        Number of observations assigned to each cluster.
    numclusters : int
        Number of clusters to update.
    centers : list of lists
        Current centers; centers[i] is replaced for every i < numclusters.

    Returns
    -------
    None. The result is delivered by mutating the centers list.
    """
    ndim = len(observations[0])

    for i in range(0, numclusters):
        count = numdataperclust[i]
        if count == 0:
            # An empty cluster is reseeded with a randomly chosen data point.
            # It is copied so the center never shares storage with the data.
            centers[i] = list(observations[random.randint(0, len(observations) - 1)])
        else:
            # Build a fresh list rather than writing into the old one, in case
            # the caller's center aliases a data row. float() forces true
            # division even for integer sums under Python 2.
            centers[i] = [centertotal[i][j] / float(count) for j in range(ndim)]

In [31]:
"""Distance takes two data points, data1 and data2, both of which are lists,
and calculates the
square of the distance between those two points."""
def distance(data1, data2):
    """Return the squared distance between two points.

    data1 and data2 are indexable sequences of numbers. Only the first
    len(data1) coordinates of data2 are read. The result is always a float.
    """
    squared = 0.0
    for axis in range(len(data1)):
        # Squared gap along this axis, added to the running total.
        gap = data1[axis] - data2[axis]
        squared += gap ** 2
    return squared

In [61]:
def output(observations, centers, membership, numdataperclust):
    """Write the clustering result to two CSV files in the working directory.

    kmeanscenters.csv gets one row per center: its coordinates followed by
    the number of data points in that cluster. kmeanslabels.csv gets one row
    per data point holding its cluster index.

    Parameters
    ----------
    observations : unused; kept so existing callers keep working.
    centers : list of lists of numbers, one per cluster. Not modified.
    membership : list of int, one cluster index per data point.
    numdataperclust : list of int, with an entry for every center.

    Returns
    -------
    None.
    """
    # Build the rows from copies so the caller's centers are left untouched
    # and repeated calls write identical files.
    center_rows = [list(centers[i]) + [numdataperclust[i]]
                   for i in range(len(centers))]
    label_rows = [[label] for label in membership]

    # Binary mode is what Python 2's csv module expects; under Python 3 the
    # files would need mode 'w' with newline='' instead.
    with open("kmeanscenters.csv", 'wb') as resultFile:
        csv.writer(resultFile).writerows(center_rows)

    with open("kmeanslabels.csv", 'wb') as resultFile:
        csv.writer(resultFile).writerows(label_rows)

In [33]:
def getinput(filename):
    """Read a comma-separated file of numbers into a list of rows.

    Every non-blank line becomes one list of floats, in file order. Lines
    may end in '\\r\\n', '\\n' or '\\r', and the last line does not need a
    terminator.

    Parameters
    ----------
    filename : path of the file to read.

    Returns
    -------
    list of lists of float, one inner list per data line.

    Raises
    ------
    ValueError
        If a field cannot be converted to float.
    """
    # The with-block closes the file even if parsing fails later.
    with open(filename, 'r') as f:
        # splitlines() copes with every common line ending, so no particular
        # terminator character is required.
        lines = f.read().splitlines()

    data = []
    for line in lines:
        if not line.strip():
            # Skip blank lines, e.g. a trailing empty line.
            continue
        data.append([float(field) for field in line.split(',')])

    return data

In [62]:
# Load the data set, time one run of the hand-written k-means, and save the
# result. Only the kmeans2 call is inside the timed region.
obs = getinput('output.csv')
start = time.time()
# 12 clusters; keep iterating while more than 1% of the points change
# cluster; at most 1 pass.
centers, membership, numdataperclust = kmeans2(obs, 12, 0.01, 1)
print 'Run Time: ', time.time() - start
# Writes kmeanscenters.csv and kmeanslabels.csv.
output(obs, centers, membership, numdataperclust)

Run Time:  0.935642957687


In [37]:
# Time the data load on its own, then time scikit-learn's KMeans on the same
# file for comparison with the hand-written implementation.
start = time.time()
# Keep the parsed rows: fitting needs them, and an empty list would make
# fit() fail on a fresh kernel.
data = getinput('output.csv')
print time.time() - start
start = time.time()
# Same cluster count and single-iteration cap as the kmeans2 run.
kmeans = KMeans(n_clusters=12, random_state=0, max_iter=1).fit(data)
end = time.time()
print 'Time: ', end - start
print kmeans.labels_
print kmeans.cluster_centers_
print kmeans.get_params(deep=True)

1.16072511673
Time:  0.449351072311
[3 1 8 ..., 4 1 9]
[[ 1367.73332841    84.13711582   234.97533221]
 [  509.21153231  1819.23130272   923.07944303]
 [  306.52388059   987.97716013  1867.18662329]
 [ 1034.31758983   351.16470022   892.93428614]
 [  354.51669248  1051.12263605   421.36711586]
 [  913.30851226   826.54750281  1459.54575423]
 [  388.06369215  1913.96386409    42.40049122]
 [ 1440.00014908   828.72003217  1039.41254967]
 [  352.43151641    94.41406234   885.15315991]
 [   32.93987131   256.84597954   117.06483946]
 [ 1660.2292635    385.3700944    663.24206833]
 [ 1564.75894216  1010.19979548   100.63964072]]
{'n_jobs': 1, 'verbose': 0, 'n_clusters': 12, 'max_iter': 1, 'init': 'k-means++', 'random_state': 0, 'n_init': 10, 'tol': 0.0001, 'precompute_distances': 'auto', 'copy_x': True}
