In [164]:
import numpy as np
class Point:
    """A labelled sample: a label plus a NumPy vector of coordinates."""

    def __init__(self, label, coordinates):
        # Stored as an ndarray so two points can be subtracted element-wise.
        self.coordinates = np.array(coordinates)
        self.label = label

    def getDist(self, cur_point):
        """Return the norm of the coordinate difference to ``cur_point``.

        For 1-D coordinate vectors this is the Euclidean distance.
        """
        offset = cur_point.coordinates - self.coordinates
        return np.linalg.norm(offset)

In [165]:
import csv
def getPoints(file_name):
    """Read labelled points from a comma-separated file with a header row.

    Each data row becomes one ``Point``: the ``'label'`` column supplies the
    label and every other column (in ``reader.fieldnames`` order) is converted
    with ``float`` to form the coordinates.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        A list of ``Point`` objects, one per data row.

    Raises:
        KeyError: if a row has no ``'label'`` column.
        ValueError/TypeError: if a non-label cell cannot be converted to float.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(file_name, 'r', newline='') as file_obj:
        reader = csv.DictReader(file_obj, delimiter=',')
        points = []
        for row in reader:
            label = row['label']
            coordinates = [
                float(row[col]) for col in reader.fieldnames if col != 'label'
            ]
            points.append(Point(label, coordinates))
    return points

In [166]:
def getLabel(points):
    """Return the most frequent ``label`` among ``points``.

    Ties are resolved the way a single left-to-right scan resolves them: the
    winner is the label whose running count first reaches the final maximum.

    Args:
        points: iterable of objects exposing a ``label`` attribute.

    Returns:
        The majority label, or ``None`` when ``points`` is empty (previously
        an empty input raised ``UnboundLocalError``).
    """
    counts = {}
    best_label = None
    best_count = 0
    for point in points:
        current = counts.get(point.label, 0) + 1
        counts[point.label] = current
        # Strict '>' keeps the earlier leader on a tie.
        if current > best_count:
            best_count = current
            best_label = point.label
    return best_label

In [167]:
def purity(colors, centers):
    """Compute the purity of a clustering.

    For every cluster the majority label is found with ``getLabel``; purity is
    the fraction of all points whose label equals the majority label of the
    cluster they were assigned to.

    Args:
        colors: sequence of clusters, each a list of objects with a ``label``
            attribute.
        centers: sequence whose length gives the number of clusters to read
            from ``colors``; its contents are not used.

    Returns:
        A float in [0, 1]. Empty clusters contribute nothing, and 0.0 is
        returned when there are no points at all (instead of dividing by zero).
    """
    matched = 0.0
    total = 0.0

    for i in range(len(centers)):
        cluster = colors[i]
        if not cluster:
            # Nothing to count, and no majority label exists for an empty cluster.
            continue
        total += len(cluster)
        majority = getLabel(cluster)
        matched += sum(1 for point in cluster if point.label == majority)

    if total == 0:
        return 0.0
    return matched / total

In [168]:
def getMins(points):
    """Return ``(min_x, min_y)`` over the first two coordinates of ``points``.

    Only ``coordinates[0]`` and ``coordinates[1]`` of each point are examined.
    The scan starts from the first point, so ``points`` must be indexable and
    non-empty.
    """
    lowest_x, lowest_y = points[0].coordinates[0], points[0].coordinates[1]
    for candidate in points:
        x, y = candidate.coordinates[0], candidate.coordinates[1]
        lowest_x = x if x < lowest_x else lowest_x
        lowest_y = y if y < lowest_y else lowest_y
    return lowest_x, lowest_y

In [169]:
def getMaxs(points):
    """Return ``(max_x, max_y)`` over the first two coordinates of ``points``.

    Only ``coordinates[0]`` and ``coordinates[1]`` of each point are examined.
    The scan starts from the first point, so ``points`` must be indexable and
    non-empty.
    """
    highest_x, highest_y = points[0].coordinates[0], points[0].coordinates[1]
    for candidate in points:
        x, y = candidate.coordinates[0], candidate.coordinates[1]
        highest_x = x if x > highest_x else highest_x
        highest_y = y if y > highest_y else highest_y
    return highest_x, highest_y

In [170]:
import numpy as np
def randomCenters(cluster_count, data=None):
    """Draw ``cluster_count`` random 2-D centers inside the data's bounding box.

    Each center is ``[x, y]`` with ``x`` drawn uniformly between the minimum and
    maximum first coordinate of the data and ``y`` likewise for the second
    coordinate (bounds come from ``getMins`` / ``getMaxs``).

    Args:
        cluster_count: number of centers to generate.
        data: points that define the bounding box. When omitted, the
            module-level ``points`` variable is used, which is what the
            original one-argument form always did.

    Returns:
        A list of ``cluster_count`` two-element lists.
    """
    if data is None:
        # Legacy behaviour: fall back to the notebook-global data set.
        data = points
    minX, minY = getMins(data)
    maxX, maxY = getMaxs(data)
    return [
        [np.random.uniform(minX, maxX), np.random.uniform(minY, maxY)]
        for _ in range(cluster_count)
    ]

In [171]:
def updateCenters(colors, cluster_count, previous_centers=None):
    """Recompute each cluster center as the mean of its members.

    Only the first two coordinates of every point are averaged.

    Args:
        colors: sequence of clusters; ``colors[i]`` is the list of points
            assigned to cluster ``i`` (each with a ``coordinates`` sequence).
        cluster_count: number of clusters to process.
        previous_centers: optional sequence of earlier centers. When given,
            an empty cluster keeps a copy of ``previous_centers[i]``.

    Returns:
        A list of ``cluster_count`` centers, each ``[mean_x, mean_y]``.

    Raises:
        ValueError: if a cluster is empty and ``previous_centers`` was not
            supplied (the mean of no points is undefined).
    """
    centers = []
    for i in range(cluster_count):
        cluster = colors[i]
        if not cluster:
            if previous_centers is None:
                raise ValueError(
                    "cluster %d is empty; cannot compute its center" % i
                )
            centers.append(list(previous_centers[i]))
            continue
        sumX = 0.0
        sumY = 0.0
        for point in cluster:
            sumX += point.coordinates[0]
            sumY += point.coordinates[1]
        centers.append([sumX / len(cluster), sumY / len(cluster)])
    return centers

In [172]:
def distance(point1, point2):
    """Return the Euclidean distance between two points in the plane.

    Only components 0 and 1 of each argument are used.
    """
    dx = point1[0] - point2[0]
    dy = point1[1] - point2[1]
    return (dx ** 2 + dy ** 2) ** 0.5

In [173]:
def getMinCenter(point, centers):
    """Return the index of the center closest to ``point``.

    Distances are measured with ``distance`` on ``point.coordinates``. On a
    tie the lowest index wins, because a later center replaces the current
    best only when it is strictly closer.
    """
    best_index = 0
    best_dist = distance(point.coordinates, centers[0])
    for idx, center in enumerate(centers):
        candidate = distance(point.coordinates, center)
        if candidate < best_dist:
            best_index, best_dist = idx, candidate
    return best_index

In [174]:
def setColors(points, centers):
    """Group ``points`` by their nearest center.

    Returns a list with one sub-list per center; ``result[i]`` holds, in input
    order, every point for which ``getMinCenter`` chose index ``i``.
    """
    colors = [[] for _ in range(len(centers))]
    for point in points:
        colors[getMinCenter(point, centers)].append(point)
    return colors

In [175]:
def kMeans(points, cluster_count):
    """Cluster ``points`` with k-means and return the purity of the result.

    Starting from ``randomCenters(cluster_count)``, the loop alternates between
    assigning every point to its nearest center (``setColors``) and moving each
    center to the mean of its members (``updateCenters``) until the centers
    stop changing (exact equality).

    A cluster that receives no points keeps its current center instead of
    being averaged, and empty clusters are left out of the purity computation;
    previously an empty cluster made the run crash.

    Note: ``randomCenters`` is called with the cluster count only, so the
    initial centers are derived from whatever data that helper reads by
    default (the module-level ``points`` in this file), not necessarily from
    the ``points`` argument.

    Args:
        points: sequence of objects with ``coordinates`` and ``label``.
        cluster_count: number of clusters.

    Returns:
        The value of ``purity`` for the final assignment.
    """
    centers = randomCenters(cluster_count)
    while True:
        colors = setColors(points, centers)
        newcenters = []
        for i, members in enumerate(colors):
            if members:
                # Update one cluster at a time so an empty sibling cluster
                # cannot break the mean computation.
                newcenters.append(updateCenters([members], 1)[0])
            else:
                newcenters.append(centers[i])
        if newcenters == centers:
            occupied = [i for i, members in enumerate(colors) if members]
            return purity(
                [colors[i] for i in occupied],
                [centers[i] for i in occupied],
            )
        centers = newcenters
            

In [181]:

# Parse the data set once: it does not change between runs, and the name
# `points` must stay bound at module level because randomCenters reads it.
points = getPoints('datasets/cancer.csv')
# Centers are drawn with np.random.uniform, so the printed purities vary
# from run to run.
for k in [2, 3, 5, 10]:
    print(kMeans(points, k))

0.8330404217926186
0.8400702987697716
0.8541300527240774
0.8787346221441125
