In [2]:
import random
import math
import time
import csv
from sklearn.cluster import KMeans
import numpy as np

In [217]:
"""kmeans takes in a list of lists, which represents a list of data
points, an integer indicating the number of clusters, a threshold
value, a float, of when to stop running the algorithm, and another
integer, representing the number of dimensions of the data points. Kmeans
outputs a list of clusters, which are represented as lists of data,
which are represented as lists."""
def kmeans2(observations, numclusters, threshold, maxiteration=300, centers=None):
    """Initializes all necessary placeholders and other variables,
    and starts timing. centers will contain the centers of clusters,
    bign is the number of observations, numchanges is number of changes, and
    since the algorithm is going through the data the first time, the
    number of changes is the number of data. mindist is the minimum
    distance that a center is from a given point. member is the membership
    of the current data point being investigated, clusters is the final
    list of clusters that will be output."""
    bign, ndim = observations.shape
    numchanges = bign
    centertotal = np.empty([numclusters, ndim])
    numdataperclust = np.empty([numclusters], dtype=int)
    membership = np.empty([bign], dtype=int)
    membership[:] = -1
    
    '''Select data points at regular intervals for initial centers.'''
    if centers is None:
        centers = choosecenters(observations, numclusters)
        print "picking initial K centers inside kmeans2"
        
    nloops = 0
    
    """Runs a while loop as long as the proportion of the number of
    changes is larger than the user-input threshold value. Then resets
    clusters to be empty and adds in the appropriate lists that represent
    each cluster. Also resets the number of changes back to 0."""
    while 1.0 * numchanges / bign > threshold and nloops < maxiteration:
        numdataperclust[:] = 0
        centertotal[:, :] = 0.0
        nloops += 1
        numchanges = 0
        """Runs a for loop through the number of data, and marks the
        distance between that data point and the first center as the
        minimum and sets its membership to 0, so that we have a value
        we can compare the rest to."""
        for i in range(0, bign):
            mindist = distance(ndim, observations[i], centers[0])
            member = 0
            """Runs a for loop through the number of clusters/the
            number of centers, so that we can check the distance between
            the data point in question and every cluster center, and if
            a center is closer than the present closest one, reset the
            smallest distance and candidate membership."""
            for j in range(1, numclusters):
                newdist = distance(ndim, observations[i], centers[j])
                if newdist < mindist:
                    mindist = newdist
                    member = j
            numdataperclust[member] += 1
            '''Aggregate new data point to new cluster center.'''
            centertotal[member, :] += observations[i, :]
            """if the membership has changed, update the membership and
            add 1 to the number of changes, and then append the data point
            to the correct cluster."""
            if member != membership[i]:
                numchanges += 1
                membership[i] = member
        """update the cluster centers once the clusters have been created."""
        updateclustercenter(observations, centertotal, numdataperclust, centers)
        print "membership[1]=", membership[1]
        print "number of changes:", numchanges
        
    print "nloops=", nloops
                
    return centers, membership, numdataperclust

In [218]:
def choosecenters(observations, numclusters):
    bign, ndim = observations.shape
    centers = np.empty([numclusters, ndim])
    stride = bign // numclusters
    counter = 0
    for i in range(0, bign, stride):
        centers[counter] = observations[i]
        counter += 1
        
    return centers

In [219]:
"""updateclustercenter takes a list of list of lists, data, which is a 
list that contains lists, each of which represents one cluster, and in
those sub-lists are lists representing data. updateclustercenter also
takes the dimensions of the data as an input, an integer. Then it returns
a new list of centers that are the average of the data in each cluster."""
def updateclustercenter(observations, centertotal, numdataperclust, centers):
    """Initialize the centers list to be empty, and the average to be 0."""
    numclusters, ndim = centertotal.shape
    
    for i in range(0, numclusters):
        if numdataperclust[i] == 0:
            # If there is no data in the cluster, pick random data point as center.
            centers[i] = observations[random.randint(0, observations.size - 1)]
        else:
            centers[i,:] = centertotal[i,:] / numdataperclust[i]

In [220]:
"""Distance takes two data points, data1 and data2, both of which are lists,
and an integer representing the number of dimensions, and calculates the
square of the distance betwee those two points."""
def distance(ndim, data1, data2):
    """sets distance to 0, so we can start accumulating later."""
    distance1 = 0.0
    
    """for every dimension, square the difference between the two data
    points, and add it to distance to get the square of the distance."""
    for i in range(0,ndim):
        distance1 += (data1[i] - data2[i]) ** 2
    
    return distance1

In [221]:
def output(observations, centers, membership, numdataperclust):
    writ = []
    writ2 = []
    
    for i in range(0, membership.size):
        writ.append([membership[i]])
        
    for i in range(0, len(centers)):
        writ2.append(np.append(centers[i],numdataperclust[i]))
            
    with open("kmeanscenters.csv", 'wb') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(writ2)
    
    with open("kmeanslabels.csv",'wb') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(writ)

In [214]:
def output2(observations, centers, membership, numdataperclust):
    x = open('kmeanstest','wb')
    centers.tofile(x)

In [222]:
def getinput(filename):
    temp = ''
    
    f = open(filename, 'r')
    initdata = f.readlines()
    
    i = 0
    ndims = 1
    while initdata[0][i] != '\r':
        if initdata[0][i] == ',':
            ndims += 1
        i += 1
        
    data = np.empty([len(initdata), ndims])
    onedata = np.empty([ndims])
    
    for i in range(0, len(initdata)):
        j = 0
        dimused = 0
        while initdata[i][j] != '\r':
            if initdata[i][j] == ',':
                onedata[dimused] = float(temp)
                dimused += 1
                temp = ''
            else:
                temp += initdata[i][j]
            j += 1
        onedata[dimused] = float(temp)
        data[i] = onedata
        temp = ''
        
    return data

In [225]:
obs = getinput('output.csv')
numclusters = 72
centers = choosecenters(obs, numclusters)
start = time.time()
centers, membership, numdataperclust = kmeans2(obs, numclusters, 0.1, maxiteration=2, centers=centers)
print 'Run Time: ', time.time() - start
output(obs, centers, membership, numdataperclust)

[[  954.34918886  1616.28975429  1742.74035722]
 [  944.08756435   200.40765426   868.81998066]
 [ 1628.78670172  1080.69843313  1927.38235636]
 [ 1916.62372942   371.23501918   246.02247928]
 [ 1624.65668473   376.39629324  1996.01748765]
 [ 1194.91032978  1184.30482786   839.19832907]
 [ 1728.63267339   489.84178081   647.91000252]
 [ 1820.44414482  1067.93056313  1361.42388251]
 [ 1103.17891512  1418.91374256  1093.97177371]
 [ 1334.05978042  1938.15137777   113.1721988 ]
 [  547.04988181  1732.70280665   624.10639161]
 [ 1281.14460935  1209.32007378  1447.7529722 ]
 [ 1967.86471604    73.74014598    42.90987202]
 [ 1121.07956996    25.01450706  1483.49674845]
 [ 1670.57335409  1757.56944613  1446.33653541]
 [  564.13014859  1511.22176162  1237.05019206]
 [  486.47408603  1462.9582721    234.57085386]
 [  959.20398888  1742.51322627   531.54568368]
 [ 1612.53081778  1100.51722745    29.74029548]
 [  626.02169224   235.81206081   467.20622683]
 [ 1813.75873399  1145.80694524   859.74

In [200]:
data = getinput('output.csv')
centers = np.empty([72,3])
stride = 72000 // 72
counter = 0
for i in range(0, 72000, stride):
    centers[counter] = data[i]
    counter += 1
with open("initialcenters.csv", 'wb') as resultFile:
    wr = csv.writer(resultFile)
    wr.writerows(centers)

In [226]:
start = time.time()
obs = getinput('output.csv')
numclusters = 72
centers = choosecenters(obs, numclusters)
print time.time() - start

start = time.time()
kmeans = KMeans(n_clusters=numclusters, random_state=0,max_iter=1,init=centers).fit(obs)
end = time.time()
print 'Time: ', end - start
np.set_printoptions(threshold=np.nan)
x = kmeans.labels_
print x
#print kmeans.cluster_centers_

1.03621983528
Time:  0.0967919826508
[ 0 65 57 27  5 70  5 31 36 44 42 68 48 60 48 14 12  8 55 55  8 54 46 19 12
 56 55 64 69 45 55 21 21 39 46 68 55 55 55 48 62 55 15 48 46 45 55 68 36 15
 23 46 60 45 26 66 54 16 44 42  9 51 38 68 13 12 43 21 29 28 43 60 51 68 65
 51  5 38 52 14 11  0 14 60 38 35  5 46 62 34  2 44 20 44 55 54 19  4 20  5
 10 68 67 23 35 38  5 30 22 13 46 70 48 10 55 27  4 39 22 43 19  7 52 57 67
  5 31 56 34 31 59 14 48  5 48 38 10 58 39 13  8 10 20  1 52 52 55  1 46  2
 23 27  4 46 38 64 24  7 45 44 38 12 45 58 48 56 38 38 27 48 30 36 61 51 48
 53 10 30 48 68 26 10  5 68 45 26 27  2 43  4 70 35 33  7 46 50 58  5 10 46
 27 54  1  5 14 55 19 42 48 58 52 66 14 69 32 34 36 13 56 65 35 55 46 68 10
 19 62 60 10 45 45 19 65 46 14 43 42 55 14 46 48 35 55 38 52 35 48 38 53  6
 18 34  5 53 55 33 31 30  6  7 10 14 66 60 68 56 36 10 38  7 45 19 55 68  4
 48 65 59 24 66 62 38 56  4 11 26 58 48 48 13 48 33 55 10  2 21 46 31 65 34
  1 39 21 68 68 13 54 55 60 10 46 42 20 62 22 26 55

In [17]:
x = np.empty([1000])
start = time.time()
x = np.zeros([1000])
print time.time() - start
start = time.time()
for i in range(0,1000):
    x[i] = 0
print time.time() - start

7.58171081543e-05
0.000598192214966


In [17]:
x = np.ones([4])
l = [1]
l.append(x)
print l
with open("t.csv",'wb') as resultFile:
    wr = csv.writer(resultFile)
    wr.writerows([l])

[1, array([ 1.,  1.,  1.,  1.])]


In [56]:
centers2 = 1.1 * np.ones([2,3], dtype=float)
centers3 = np.ones([2.3],dtype=int)
print centers2
fd = open('kmeanstest','wb')
centers2.tofile(fd)
fd.close()

[[ 1.1  1.1  1.1]
 [ 1.1  1.1  1.1]]


In [51]:
from scipy.io.numpyio import fwrite
centers2 = 1.1 * np.ones([2,3], dtype=float)
fd = open('kmeanstest', 'wb')
fwrite(fd, centers2.size, centers2)
fd.close()

ImportError: No module named numpyio

In [60]:
floats = [1.1,2.2,3.3]
a = np.array(floats, 'float32')
fd = open('kmeanstest', 'wb')
a.tofile(fd)
fd.close()