# Getting the data

In [525]:
from pyspark import SparkContext, SparkConf
from math import sqrt

sc = SparkContext.getOrCreate()
sc.stop()

conf = SparkConf().setAppName("kmeans").setMaster("local[*]")
sc = SparkContext(conf=conf)

* Getting data from S-sets : https://cs.joensuu.fi/sipu/datasets/
* Using the s1 dataset

In [526]:
s1 = sc.textFile("../data/s1.txt")


def extract_split(x):
    splits = x.split('    ')
    return (int(splits[1]), int(splits[2]))

s1_map = s1.flatMap(lambda x: x.split('\n')).map(extract_split)



In [527]:
s1_map.takeSample(False, 20)

[(430243, 781020),
 (351564, 573155),
 (311433, 134432),
 (400902, 772920),
 (629000, 417990),
 (314552, 151035),
 (330335, 568221),
 (233284, 860085),
 (156430, 615117),
 (353991, 193997),
 (289423, 559134),
 (76253, 529394),
 (262846, 559590),
 (864374, 150149),
 (221994, 575146),
 (848974, 199427),
 (358483, 554925),
 (368200, 364844),
 (410336, 767856),
 (127202, 525948)]

# Kmeans ++ intitialization

In [528]:
cluster1_center = s1_map.takeSample(False, 1)[0]


data_length = s1_map.count()
print(data_length)
BUCKET_SIZE = 5000
BUCKET_NUMBER = int(data_length / BUCKET_SIZE)

print(BUCKET_NUMBER)

5000
1


In [529]:
from random import randint
s1_map = s1_map.map(lambda x:(randint(1,BUCKET_NUMBER),) + x)

In [530]:
s1_map = s1_map.map(lambda x : x + cluster1_center)
s1_map.takeSample(False,20)

[(1, 424089, 352640, 681049, 846975),
 (1, 442286, 755194, 681049, 846975),
 (1, 307616, 207295, 681049, 846975),
 (1, 144856, 551807, 681049, 846975),
 (1, 174672, 345849, 681049, 846975),
 (1, 626586, 415166, 681049, 846975),
 (1, 854463, 159977, 681049, 846975),
 (1, 683656, 842406, 681049, 846975),
 (1, 608711, 373665, 681049, 846975),
 (1, 419227, 779215, 681049, 846975),
 (1, 311433, 134432, 681049, 846975),
 (1, 299821, 866288, 681049, 846975),
 (1, 347638, 429109, 681049, 846975),
 (1, 830065, 739162, 681049, 846975),
 (1, 270233, 195701, 681049, 846975),
 (1, 351815, 574947, 681049, 846975),
 (1, 682510, 905902, 681049, 846975),
 (1, 489182, 773934, 681049, 846975),
 (1, 808440, 729985, 681049, 846975),
 (1, 680592, 858526, 681049, 846975)]

In [531]:
def compute_distance(xi_indexes, yi_indexes):
    def bar(row):
        sum = 0
        for i in range(len(xi_indexes)):
            sum += (row[yi_indexes[i]] - row[xi_indexes[i]]) ** 2
        distance = sqrt(sum)
        return row + (distance,)
    return bar


In [532]:
def reduceByKeyMax(dist_indexes):
    def reduce_custom(x1,x2):
        dist_x1 = []
        dist_x2 = []
        for idx in dist_indexes:
            dist_x1.append(x1[idx])
            dist_x2.append(x2[idx])
        
        mindist_x1 =  min(dist_x1)
        mindist_x2 = min(dist_x2)
        
        
        if(mindist_x1 > mindist_x2):
            return x1
        else:
            return x2
    return reduce_custom



In [533]:
def compute_average(list_reduce, coord_indexes):
    result = ()
    for idx in coord_indexes:
        coord_list = [reduce_tuple[1][idx] for reduce_tuple in list_reduce]
        average = sum(coord_list)/len(coord_list)
        result += (average,)
    return result
        

In [534]:
def initCluster(data,num_clusters,num_features):
    xi_indexes = [idx for idx in range(1,num_features+1)]
    yi_indexes = [idx for idx in range(num_features+1, num_features*2 + 1)]
    coord_indexes = [idx for idx in range(num_features)]
    dist_indexes = [num_features*2]
    current_clust = 2        
    for _ in range(num_clusters-1):
        
        
        data = data.map(compute_distance(xi_indexes, yi_indexes))
        
        reduce_tuple = data.map(lambda x:(x[0], x[1:]))
        
        reduce_tuple = reduce_tuple.reduceByKey(reduceByKeyMax(dist_indexes=dist_indexes))
        
        new_cluster_point = compute_average(reduce_tuple.collect(), coord_indexes=coord_indexes)
        
        print("Computing center of cluster n° {}".format(current_clust))
        print("New cluster point : {}".format(new_cluster_point))
        
        data = data.map(lambda x:(x + new_cluster_point))
        
        current_clust += 1
        dist_indexes.append(current_clust*num_features + current_clust - 2)
        yi_indexes = [old_value + num_features + 1 for old_value in yi_indexes]
        
        print("New yi indexes : {}".format(yi_indexes))
        print("New dist_indexes : {}".format(dist_indexes))
        print("\n")

    return data
        


In [535]:
s1_map = initCluster(s1_map, num_features=2,num_clusters=10)
s1_map.takeSample(withReplacement=False,num=100)

Computing center of cluster n° 2
New cluster point : (270729.0, 67329.0)
New yi indexes : [6, 7]
New dist_indexes : [4, 7]


Computing center of cluster n° 3
New cluster point : (925732.0, 210388.0)
New yi indexes : [9, 10]
New dist_indexes : [4, 7, 10]


Computing center of cluster n° 4
New cluster point : (94382.0, 320888.0)
New yi indexes : [12, 13]
New dist_indexes : [4, 7, 10, 13]


Computing center of cluster n° 5
New cluster point : (854996.0, 81095.0)
New yi indexes : [15, 16]
New dist_indexes : [4, 7, 10, 13, 16]


Computing center of cluster n° 6
New cluster point : (94382.0, 320888.0)
New yi indexes : [18, 19]
New dist_indexes : [4, 7, 10, 13, 16, 19]


Computing center of cluster n° 7
New cluster point : (854996.0, 81095.0)
New yi indexes : [21, 22]
New dist_indexes : [4, 7, 10, 13, 16, 19, 22]


Computing center of cluster n° 8
New cluster point : (94382.0, 320888.0)
New yi indexes : [24, 25]
New dist_indexes : [4, 7, 10, 13, 16, 19, 22, 25]


Computing center of cluster n

[(1,
  567565,
  573952,
  681049,
  846975,
  295669.03250932455,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0,
  536603.7081357153,
  94382.0,
  320888.0),
 (1,
  798056,
  714380,
  681049,
  846975,
  176839.11352978446,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0,
  806221.4660625205,
  94382.0,
  320888.0),
 (1,
  306540,
  193434,
  681049,
  846975,
  753241.5480853403,
  94382.0,
  320888.0,
  247498.56379381276,
  9

# Vieux brouillon de code en dessous

In [479]:
s1_map = s1_map.map(compute_distance([1,2],[3,4]))
s1_map.takeSample(False,20)

[(5,
  5,
  5,
  330145,
  546705,
  2,
  3,
  677769,
  841641,
  35,
  11,
  872506,
  157517,
  20,
  793019,
  764859,
  5,
  55359,
  623076,
  495388,
  156422,
  641398.064042136,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  817804.8858733238,
  862449.8,
  755020.2,
  1064077.7481913622,
  862449.8,
  755020.2,
  677332.8540163987,
  862449.8,
  755020.2,
  70124.4463370657,
  862449.8,
  755020.2,
  70124.4463370657,
  862449.8,
  755020.2,
  70124.4463370657,
  862

In [480]:

s1_reduce = s1_map.map(lambda x:(x[0], x[1:6]))
s1_reduce.takeSample(False,20)


[(1, (3, 5, 412227, 787450, 2)),
 (2, (1, 2, 201567, 881003, 2)),
 (1, (2, 1, 798321, 319665, 2)),
 (4, (5, 3, 197735, 355477, 2)),
 (5, (1, 3, 718088, 852181, 2)),
 (1, (1, 4, 864053, 139325, 2)),
 (1, (1, 3, 144242, 557674, 2)),
 (2, (1, 2, 608557, 590278, 2)),
 (5, (3, 5, 796786, 724615, 2)),
 (5, (2, 4, 878519, 606677, 2)),
 (5, (3, 1, 473485, 847098, 2)),
 (2, (2, 1, 876503, 659112, 2)),
 (4, (2, 4, 307747, 128551, 2)),
 (3, (2, 2, 450030, 169537, 2)),
 (4, (4, 3, 769876, 257669, 2)),
 (5, (3, 4, 541006, 175198, 2)),
 (3, (1, 2, 512291, 169608, 2)),
 (4, (4, 4, 268289, 878265, 2)),
 (5, (1, 2, 825414, 734466, 2)),
 (1, (1, 5, 914183, 620056, 2))]

In [481]:
s1_reduce = s1_reduce.reduceByKey(reduceByKeyMax(dist_indexes=[4]))
s1_reduce.collect()

[(2, (4, 1, 683321, 873932, 2)),
 (4, (5, 1, 684091, 842566, 2)),
 (1, (1, 1, 650661, 861267, 2)),
 (3, (2, 2, 599647, 858702, 2)),
 (5, (1, 2, 691827, 863963, 2))]

In [482]:
print(cluster1_center)
print (new_cluster_point)

(2, 3, 677769, 841641, 35, 11, 872506, 157517, 20, 793019, 764859, 5, 55359, 623076, 495388, 156422, 641398.064042136, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 817804.8858733238, 862449.8, 755020.2, 1064077.7481913622, 862449.8, 755020.2, 677332.8540163987, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.4463370657, 862449.8, 755020.2, 70124.446

In [483]:
new_cluster_point = compute_average(s1_reduce.collect(), [0,1])
