In [1]:
import random
import math
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import axes3d
import csv
import sys
import time
from array import array

In [2]:
# Shared generator configuration. clustergenexec() overwrites all five names
# at the start of every run; the helper functions then read maxs, mins,
# numclust and dim as module-level globals.
#   maxs     - per-dimension maximums of the data space
#   mins     - per-dimension minimums of the data space
#   levels   - number of sub-clustering levels (0 until clustergenexec sets it)
#   numclust - number of clusters at each level, outermost level first
#   dim      - number of dimensions
maxs, mins = [], []
levels = 0
numclust = []
dim = 0

In [19]:
def _report(label, value):
    """Print ``label`` and ``value`` separated by a single space.

    The single-argument ``print(...)`` form is used because it produces the
    same output as ``print label, value`` under Python 2 and is also valid
    Python 3.
    """
    print("%s %s" % (label, value))


def clustergenexec(numdata, numcluster, dimensions, upperrange,
                   lowerrange=None, mindist=-1, lvls=2, radius=-1):
    """Generate labelled, hierarchically clustered synthetic data.

    The function seeds ``random`` with 0, places the top-level cluster
    centers uniformly inside the bounding box, fills every cluster (and its
    sub-clusters) with points, plots the result when the data is 2D or 3D,
    passes the points through ``dataperturb`` and writes them out with
    ``datatocsv``. Timing information for each stage is printed.

    Args:
        numdata: total number of points requested (int). Points are split
            with floor division at every level, so fewer than ``numdata``
            points are produced when the counts do not divide evenly.
        numcluster: list with the number of clusters at each level, outermost
            first; e.g. ``[3, 2, 2]`` means 3 clusters, each with 2
            sub-clusters, each with 2 sub-sub-clusters. ``clustergen``
            indexes this list by the remaining level count, so it should have
            ``lvls + 1`` entries.
        dimensions: number of dimensions (int).
        upperrange: list with the maximum of each dimension.
        lowerrange: optional list with the minimum of each dimension. When
            omitted (or given as ``[0]``) the origin is used. The argument is
            never modified.
        mindist: optional minimum gap between the edges of two clusters; two
            centers must be at least ``mindist + 2 * radius`` apart. The
            sentinel ``-1`` selects 1% of the average half-range of the
            dimensions.
        lvls: optional number of sub-clustering levels (default 2). With 0
            the top-level clusters are filled directly.
        radius: optional cluster radius. The sentinel ``-1`` selects 5% of
            the average half-range of the dimensions.

    Returns:
        The list of points after ``dataperturb``; each point is a list of
        ``dimensions`` coordinates followed by the index of the top-level
        cluster it was generated for.

    Side effects:
        Rebinds the module globals ``maxs``, ``mins``, ``levels``,
        ``numclust`` and ``dim``; reseeds ``random``; may open a plot window;
        writes files through ``datatocsv``.

    Note:
        Center placement retries until every center fits, so the call does
        not return if the bounding box cannot hold ``numcluster[0]`` centers
        at the required spacing.
    """
    global maxs, mins, levels, numclust, dim

    random.seed(0)
    start1 = time.time()

    # Build a fresh list for the default lower bound. The original appended
    # to the shared default list, which leaked the padding into later calls
    # that used a different number of dimensions.
    if lowerrange is None or lowerrange == [0]:
        lowerrange = [0] * dimensions

    maxs = upperrange
    mins = lowerrange
    levels = lvls
    numclust = numcluster
    dim = dimensions

    num = numclust[0]

    # size accumulates half of every dimension's range, so size / dim is the
    # average half-range; it is the scale for the defaults and for shrinking
    # mindist/radius one level down.
    size = 0
    for i in range(0, dim):
        size += (maxs[i] - mins[i]) / 2.0
    scale = size / dim
    if mindist == -1:
        mindist = 0.01 * scale
    if radius == -1:
        radius = 0.05 * scale
    newmin = mindist * radius / scale
    newrad = radius ** 2 / scale

    _report("Variable Initialization Time: ", time.time() - start1)

    # Rejection-sample the top-level centers: draw a uniform candidate inside
    # the bounding box and keep it only if it is far enough from EVERY center
    # accepted so far (the original loop skipped the most recent one).
    start = time.time()
    distthreshold = (mindist + radius * 2) ** 2
    centers = []
    while len(centers) < num:
        candidate = [random.random() * (maxs[j] - mins[j]) + mins[j]
                     for j in range(0, dim)]
        tooclose = False
        for existing in centers:
            distance = 0
            for k in range(0, dim):
                distance += (existing[k] - candidate[k]) ** 2
            if distance < distthreshold:
                tooclose = True
                break
        if not tooclose:
            centers.append(candidate)
    _report("Cluster Center Generation Time: ", time.time() - start)

    # Fill every top-level cluster, recursing through clustergen when
    # sub-clusters were requested. A negative level count generates nothing.
    start = time.time()
    numperclust = numdata // num
    data = []
    if levels > 0:
        for i in range(0, num):
            data += clustergen(numperclust,
                               radius,
                               newmin,
                               levels - 1,
                               centers[i],
                               newrad,
                               i)
    elif levels == 0:
        for i in range(0, num):
            data += clusterfromcenter(centers[i],
                                      numperclust,
                                      radius,
                                      i)
    _report("Cluster Generation Time: ", time.time() - start)

    # Only 2D and 3D data can be plotted.
    start = time.time()
    if dim == 2:
        displayscatterplot2D(data)
    if dim == 3:
        displayscatterplot3D(data)
    graph = time.time() - start
    _report("Display as Graph Time: ", graph)

    start = time.time()
    data = dataperturb(data)
    _report("Perturb Time: ", time.time() - start)

    start = time.time()
    datatocsv(data)
    end = time.time()
    _report("Output as CSV File Time: ", end - start)

    _report("Total Time: ", end - start1)
    _report("Total Time without graph: ", end - start1 - graph)

    print(len(data))
    return data

In [4]:
def clustergen(numdata, size, mindist, levels, center, radius, clusternumber):
    """Recursively generate the sub-clusters nested inside one cluster.

    Args:
        numdata: number of points to generate inside this cluster (int);
            split between the sub-clusters with floor division.
        size: radius of the enclosing, one-level-up cluster. It is the
            denominator used to shrink ``mindist`` and ``radius`` for the
            next level and is forwarded to ``clusterhelper`` on the last
            level.
        mindist: minimum spacing used for the centers created at this level.
        levels: number of sub-clustering levels still to be created below
            this one; 0 means this is the final level.
        center: center of the cluster being subdivided (list of numbers).
        radius: radius at this level; handed to ``centergen`` as the spread
            of the sub-cluster centers around ``center``.
        clusternumber: label of the top-level cluster, passed down unchanged
            so every generated point carries it.

    Returns:
        A flat list of points (lists of numbers) from all sub-clusters.
    """
    global numclust
    # numclust is ordered outermost level first, so the entry for this level
    # sits levels + 1 positions from the end.
    branching = numclust[len(numclust) - levels - 1]
    child_mindist = mindist * radius / size
    child_radius = radius ** 2 / size
    share = numdata // branching

    # Final level: clusterhelper creates the leaf clusters directly.
    if levels == 0:
        return clusterhelper(numdata, branching, size, mindist,
                             center, radius, clusternumber)

    # Otherwise place the sub-cluster centers and recurse into each of them
    # with the shrunken spacing and radius.
    subcenters = centergen(branching, radius, mindist, center, child_radius)
    points = []
    for idx in range(branching):
        points.extend(clustergen(share, radius, child_mindist, levels - 1,
                                 subcenters[idx], child_radius, clusternumber))
    return points


In [5]:
def clusterhelper(numdata, numcluster, size, mindist, center, radius, clusternumber):
    """Create the lowest level of sub-clusters around ``center``.

    Args:
        numdata: number of points to generate in total (int); divided between
            the clusters with floor division.
        numcluster: number of leaf clusters to create (int).
        size: spread handed to ``centergen`` for placing the leaf centers
            around ``center``.
        mindist: minimum spacing between leaf centers; each leaf cluster is
            drawn with radius ``mindist / 2``.
        center: center of the cluster being subdivided (list of numbers).
        radius: radius handed to ``centergen`` for its spacing threshold.
        clusternumber: label appended to every generated point.

    Returns:
        A flat list of points, each a list of numbers ending in
        ``clusternumber``.
    """
    leaf_centers = centergen(numcluster, size, mindist, center, radius)
    per_leaf = numdata // numcluster
    leaf_radius = mindist / 2.0

    # One cluster of per_leaf points around every generated center.
    points = []
    for idx in range(numcluster):
        points.extend(clusterfromcenter(leaf_centers[idx], per_leaf,
                                        leaf_radius, clusternumber))
    return points

In [6]:
def clusterfromcenter(center, numdata, radius, clusternumber):
    """Generate one cluster of labelled points around ``center``.

    Candidates are drawn with ``gencoord`` using a spread of ``radius / 3``,
    so the radial distance is within ``radius`` roughly 99.7% of the time
    (three standard deviations). Candidates that fall outside the global
    bounds ``mins``/``maxs`` in any dimension are discarded and redrawn.

    Args:
        center: center of the cluster (list of at least ``dim`` numbers).
        numdata: number of points to generate (int).
        radius: radius of the cluster (int or float).
        clusternumber: label appended to every point after its coordinates.

    Returns:
        A list of ``numdata`` points, each ``dim`` coordinates followed by
        ``clusternumber``.

    Note:
        The loop retries until ``numdata`` in-bounds points exist, so it does
        not terminate if no candidate can ever land inside the bounds.
    """
    global maxs, mins, dim

    # Float division on purpose: under Python 2 an integer radius would
    # otherwise be floor-divided (20 / 3 == 6) and shrink the cluster.
    sigma = radius / 3.0

    data = []
    while len(data) < numdata:
        point = gencoord(sigma)

        # Shift the candidate from center-relative to absolute coordinates
        # and reject it as soon as one coordinate leaves the bounding box.
        inbounds = True
        for j in range(0, dim):
            point[j] += center[j]
            if point[j] > maxs[j] or point[j] < mins[j]:
                inbounds = False
                break

        if inbounds:
            point.append(clusternumber)
            data.append(point)

    return data

In [7]:
def centergen(numcenter, size, mindist, refpoint, radius):
    """Generate cluster centers scattered around a reference point.

    Candidates are drawn with ``gencoord`` using a spread of ``size / 3``
    (so the radial distance from ``refpoint`` is within ``size`` roughly
    99.7% of the time). A candidate is kept only if it lies inside the global
    bounds ``mins``/``maxs`` and is at least ``mindist + 2 * radius`` away
    from every center accepted so far.

    Args:
        numcenter: number of centers to generate (int).
        size: maximum distance (three-sigma) of the centers from
            ``refpoint`` (int or float).
        mindist: minimum gap between the edges of two clusters.
        refpoint: point the centers are scattered around (list of at least
            ``dim`` numbers).
        radius: radius of the clusters that will be built on the centers;
            used only for the spacing threshold.

    Returns:
        A list of ``numcenter`` centers, each a list of ``dim`` numbers.

    Note:
        The loop retries until ``numcenter`` valid centers exist, so it does
        not terminate if that many centers cannot satisfy the constraints.
    """
    global dim, maxs, mins

    # Float division on purpose: under Python 2 an integer size would
    # otherwise be floor-divided and shrink the spread of the centers.
    sigma = size / 3.0
    distthreshold = (mindist + radius * 2) ** 2

    centers = []
    while len(centers) < numcenter:
        candidate = gencoord(sigma)

        # Shift the candidate from refpoint-relative to absolute coordinates
        # and reject it as soon as one coordinate leaves the bounding box.
        inbounds = True
        for j in range(0, dim):
            candidate[j] += refpoint[j]
            if candidate[j] > maxs[j] or candidate[j] < mins[j]:
                inbounds = False
                break
        if not inbounds:
            continue

        # Compare squared distances against the squared threshold; the first
        # conflict is enough to reject the candidate.
        tooclose = False
        for existing in centers:
            distance = 0
            for k in range(0, dim):
                distance += (existing[k] - candidate[k]) ** 2
            if distance < distthreshold:
                tooclose = True
                break
        if not tooclose:
            centers.append(candidate)

    return centers

In [8]:
def gencoord(radius):
    """Generate one random point around the origin in ``dim`` dimensions.

    A signed radial distance ``r`` is drawn from a normal distribution with
    mean 0 and standard deviation ``radius``; ``dim - 2`` angles are drawn
    uniformly from [0, pi) and one final angle from [0, 2*pi). The spherical
    coordinates are then converted to rectangular ones:

        x(k) = r * sin(theta(1)) * ... * sin(theta(k-1)) * cos(theta(k))
        x(dim) = r * sin(theta(1)) * ... * sin(theta(dim-1))

    so the returned point lies at distance ``abs(r)`` from the origin.

    Args:
        radius: standard deviation of the radial distance (callers pass one
            third of the cluster radius they want).

    Returns:
        A list of ``dim`` floats. For ``dim == 1`` the single coordinate is
        ``r`` itself (one angle is still drawn so the random stream is
        consumed the same way).
    """
    global dim

    r = random.gauss(0, 1) * radius

    # dim - 2 polar angles followed by a single azimuthal angle.
    anglesin = []
    anglecos = []
    for _ in range(0, dim - 2):
        angle = random.random() * math.pi
        anglesin.append(math.sin(angle))
        anglecos.append(math.cos(angle))
    angle = random.random() * 2 * math.pi
    anglesin.append(math.sin(angle))
    anglecos.append(math.cos(angle))

    # partial carries r * sin(theta(1)) * ... * sin(theta(j)). Each sine is
    # applied exactly once; the original multiplied the last sine into the
    # final coordinate twice, which pulled points off the sphere.
    coords = []
    partial = r
    for j in range(0, dim - 1):
        coords.append(partial * anglecos[j])
        partial *= anglesin[j]
    coords.append(partial)

    return coords

In [9]:
def dataperturb(data):
    """Return ``len(data)`` entries drawn at random from ``data``.

    Every output slot is filled with an independently chosen element of
    ``data`` (sampling WITH replacement), so the result can contain an input
    point several times and omit others. The returned list holds references
    to the original point objects, not copies.

    NOTE(review): if the intent is only to randomise the order of the
    points, a shuffle would keep each point exactly once - confirm.

    Args:
        data: list of points.

    Returns:
        A new list with the same length as ``data``.
    """
    count = len(data)
    last = count - 1
    return [data[random.randint(0, last)] for _ in range(0, count)]

In [10]:
def displayscatterplot2D(data):
    """Show two-dimensional data as a scatter plot.

    Args:
        data: list of points; element 0 of each point is used as x and
            element 1 as y (any further elements are ignored).
    """
    # Split the points into the per-axis sequences matplotlib expects.
    xs = [point[0] for point in data]
    ys = [point[1] for point in data]

    # Marker area passed as ``s`` and an RGB triple for black markers.
    marker_area = np.pi*3
    black = (0,0,0)

    plt.scatter(xs, ys, s=marker_area, c=black, alpha=0.5)
    plt.title('Scatter Plot of Synthetically Generated Clustered Data Points')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

In [11]:
def displayscatterplot3D(data):
    """Show three-dimensional data as a 3D scatter plot.

    Args:
        data: list of points; elements 0, 1 and 2 of each point are used as
            x, y and z (any further elements are ignored).
    """
    # Create one 3D axes directly. The previous code first added a 2D subplot
    # with the deprecated ``axisbg`` keyword and then asked ``fig.gca`` for a
    # second, 3D axes on top of it. The '3d' projection is registered by the
    # mpl_toolkits.mplot3d import at the top of the file.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    # Split the points into the per-axis sequences matplotlib expects.
    x = [point[0] for point in data]
    y = [point[1] for point in data]
    z = [point[2] for point in data]

    ax.scatter(x, y, z, alpha=0.8, c="red", edgecolors='none',
               s=30, label="data")
    plt.title('3D Scatter Plot of Synthetically Generated Clustered Data Points')
    plt.legend(loc=3)
    plt.show()

In [12]:
def datatocsv(data):
    """Write the generated points to two CSV files in the working directory.

    ``output.csv`` receives one row per point with its first ``dim``
    coordinates; ``labels.csv`` receives one single-column row per point with
    the value stored at index ``dim`` (the cluster label). Both files are
    overwritten. They are opened in ``'wb'`` mode, the csv convention for
    Python 2.

    Args:
        data: list of points, each with at least ``dim + 1`` elements.
    """
    global dim

    # Separate every point into its coordinates and its trailing label.
    coords = [[row[k] for k in range(0, dim)] for row in data]
    labels = [[row[dim]] for row in data]

    with open("labels.csv", 'wb') as resultFile:
        csv.writer(resultFile).writerows(labels)

    with open("output.csv",'wb') as resultFile:
        csv.writer(resultFile).writerows(coords)

    # Disabled experiment kept for reference (binary output of each row):
    # for i in range(0, len(writ)):
    #     newarray = array('d', writ[i])
    #     print newarray
    #     newFileByteArray = bytearray(newarray)
    #     newFile.write(newFileByteArray)

In [13]:
def _parse_list_arg(text, convert):
    """Parse a bracketed, comma-separated argument such as ``"[3,2,2]"``.

    Square brackets are ignored and every comma-separated token is passed to
    ``convert`` (``int`` or ``float``); an empty token therefore raises the
    converter's ``ValueError``.
    """
    inner = text.replace('[', '').replace(']', '')
    return [convert(token) for token in inner.split(',')]


def main():
    """Run ``clustergenexec`` with arguments taken from ``sys.argv``.

    Expected command line (4 required and up to 4 optional arguments):

        prog numdata [n1,n2,...] dimensions [max1,max2,...]
             [[min1,min2,...] [mindist [lvls [radius]]]]

    List arguments are written without spaces between the brackets so that
    each one arrives as a single ``argv`` entry. When the number of arguments
    is outside the accepted range a message is printed and nothing is
    generated. ``sys.argv`` itself is left unmodified.

    Raises:
        ValueError: if a numeric argument or list element cannot be
            converted.
    """
    args = sys.argv

    # Validate the count before touching any positional argument; the
    # original indexed args[2] and args[4] first and crashed with IndexError
    # instead of reaching its own error message.
    if len(args) < 5 or len(args) > 9:
        print("Incorrect number of arguments must have >= 4 and <= 8.")
        return

    numdata = int(args[1])
    numclusters = _parse_list_arg(args[2], int)
    dimensions = int(args[3])
    maximums = _parse_list_arg(args[4], float)

    # Optional arguments are positional in clustergenexec, so collect as many
    # as were supplied, in order: lowerrange, mindist, lvls, radius.
    optional = []
    if len(args) >= 6:
        optional.append(_parse_list_arg(args[5], float))
    if len(args) >= 7:
        optional.append(float(args[6]))
    if len(args) >= 8:
        optional.append(int(args[7]))
    if len(args) >= 9:
        optional.append(float(args[8]))

    clustergenexec(numdata, numclusters, dimensions, maximums, *optional)

In [21]:
# The triple-quoted string below is a block of disabled manual tests; as a
# bare string expression it has no effect when the cell runs. The calls inside
# it predate the current signatures (centergen is called with 4 arguments but
# now takes 5, clusterfromcenter with 3 instead of 4, clusterhelper and
# clustergen with 5 instead of 7) and they assign to `max` rather than the
# `maxs` global, so they would need updating before being re-enabled.
"""dim = 5
print "This is a test coordinate: ", gencoord(5), "\n"
dim = 3
print "This is a test coordinate: ", gencoord(3), "\n"
dim = 10
print "This is a test coordinate: ", gencoord(10), "\n"
dim = 2
print "This is a test coordinate: ", gencoord(0), "\n"
dim = 12
print "This is a test coordinate: ", gencoord(23), "\n\n"

dim = 5
max = [1000,1000,1000,1000,1000]
print "This is 3 test centers: \n", centergen(3, 100, 2, [500,500,500,500,500]), "\n"
dim = 3
max = [500,500,500]
print "This is 5 test centers: \n", centergen(5, 50, 1, [250,250,250]), "\n"
dim = 7
max = [100,100,100,100,100,100,100]
print "This is 2 test centers: \n", centergen(2, 20, 1, [70,70,70,70,70,70,70]), "\n"
dim = 2
max = [300,300]
print "This is 2 test centers: \n", centergen(2, 300, 1, [0,0]), "\n"

dim = 3
max = [1000,1000,1000]
print "This is a test cluster: \n", clusterfromcenter([500,500,500], 5, 20), "\n"
dim = 5
max = [500,500,500,500,500]
print "This is a test cluster: \n", clusterfromcenter([100,100,100,100,100], 4, 25), "\n"
dim = 2
max = [50,50]
print "This is a test cluster: \n", clusterfromcenter([0,0], 6, 10), "\n"

dim = 3
max = [1000,1000,1000]
print "This is a clusterhelper test: \n", clusterhelper(18, 6, 100, 5, [500,500,500]), "\n"
dim = 5
max = [500,500,500,500,500]
print "This is a clusterhelper test: \n", clusterhelper(12, 3, 100, 4, [100,100,100,100,100]), "\n"
dim = 2
max = [100,100]
print "This is a clusterhelper test: \n", clusterhelper(6, 2, 25, 2, [0,0]), "\n"

dim = 3
max = [1000,1000,1000]
numclust = [3,2]
print "This is a clustergen test: \n", clustergen(24, 200, 5, 1, [500,500,500]), "\n"
dim = 5
max = [2000, 2000, 2000, 2000, 2000]
numclust = [3,2,2]
l = clustergen(36, 500, 7, 2, [1000, 1000, 1000, 1000, 1000])
print "This is a clustergen test: \n", l, "\n"

dim = 2
max = [100,100]
numclust = [3,2,2]
l = clustergen(24, 50, 4, 2, [0,0])
print "This is a cluster
gen test: \n", l, "\n"
"""

# Actual run: 72000 points in 72 top-level clusters, 3 dimensions with upper
# bounds of 2000 each, default lower bounds, and lvls=0 so the clusters are
# filled directly with clusterfromcenter (no sub-clusters).
l = clustergenexec(72000, [72], 3, [2000, 2000, 2000],lvls=0)
#print l
#print "This is a clustergenexec test: \n", l, "\n"

Variable Initialization Time:  1.12056732178e-05
Cluster Center Generation Time:  0.00382900238037
Cluster Generation Time:  0.564446926117
Display as Graph Time:  6.72316193581
Perturb Time:  0.127067089081
Output as CSV File Time:  0.75283908844
Total Time:  8.17185115814
Total Time without graph:  1.44868922234
72000


In [31]:
from array import array

# Scratch check of binary output: dump five doubles to a file named 'file' in
# the working directory using the array's native machine representation.
float_array = array('d', [3.14, 2.7, 0.0, -1.0, 1.1])

# The with-block closes the handle even if tofile() raises.
with open('file', 'wb') as output_file:
    float_array.tofile(output_file)