# Getting the data

In [241]:
from pyspark import SparkContext, SparkConf
from math import sqrt

# Stop whichever SparkContext is currently registered (getOrCreate returns the
# existing one, or makes a throwaway one) so the context built below is the
# only one alive and really uses our configuration.
SparkContext.getOrCreate().stop()

# Application name "kmeans", master "local[*]".
conf = SparkConf().setMaster("local[*]").setAppName("kmeans")
sc = SparkContext(conf=conf)

* Data source: the S-sets clustering benchmark, https://cs.joensuu.fi/sipu/datasets/
* This notebook uses the s1 dataset from that collection.

In [242]:
# Raw text RDD for the s1 file; parsing into coordinates happens further down.
s1 = sc.textFile(name="../data/s1.txt")


def extract_split(x):
    """Parse one text line into a pair of integer coordinates.

    The line is cut on runs of exactly four spaces. Field 0 is ignored
    (presumably empty because each data line starts with the delimiter --
    confirm against the data file); fields 1 and 2 are converted with int().

    Args:
        x: one line of text.

    Returns:
        A 2-tuple ``(int(field 1), int(field 2))``.

    Raises:
        IndexError: the line has fewer than three four-space-delimited fields.
        ValueError: field 1 or field 2 is not an integer literal.
    """
    fields = x.split('    ')
    return tuple(int(fields[position]) for position in (1, 2))

# `s1` comes from sc.textFile, which already yields one record per line, so no
# extra split on '\n' is needed before parsing.
# Blank / whitespace-only lines (e.g. a trailing empty line at the end of the
# file) are dropped first because extract_split cannot parse them.
s1_map = s1.filter(lambda line: line.strip()).map(extract_split)



In [243]:
# Eyeball 20 parsed points, sampled without replacement.
s1_map.takeSample(withReplacement=False, num=20)

[(397764, 406712),
 (318215, 180010),
 (366654, 802105),
 (429220, 787348),
 (333787, 189423),
 (810817, 509540),
 (608923, 385575),
 (138199, 541774),
 (137451, 555339),
 (334182, 558498),
 (175586, 365049),
 (856619, 544008),
 (225567, 853078),
 (853554, 158012),
 (535564, 177248),
 (320628, 564596),
 (615239, 317684),
 (160165, 362858),
 (861033, 507671),
 (352209, 548608)]

# K-means++ initialization

In [244]:
# First centre: a single point drawn at random (without replacement) from the
# parsed data set.
cluster1_center = s1_map.takeSample(False, 1)[0]

data_length = s1_map.count()
print(data_length)

# Desired number of points per bucket.
BUCKET_SIZE = 1000
# Number of buckets the points are spread over. Floor division keeps the
# arithmetic in integers, and the clamp guarantees at least one bucket: with
# fewer than BUCKET_SIZE points the plain quotient is 0, which would make the
# later randint(1, BUCKET_NUMBER) call fail.
BUCKET_NUMBER = max(1, data_length // BUCKET_SIZE)



5000


In [245]:
from random import randint

# Prefix every point with a random bucket id in [1, BUCKET_NUMBER].
# The mapping function is non-deterministic and RDDs are evaluated lazily, so
# without caching each action may re-run this step and assign different
# buckets to the same point. cache() keeps the first assignment for as long
# as the partitions stay in memory.
s1_map = s1_map.map(lambda x: (randint(1, BUCKET_NUMBER),) + x).cache()

In [246]:
# Append the current centre's coordinates to every row so the distance can be
# computed from the row alone.
s1_map = s1_map.map(lambda row: row + cluster1_center)
s1_map.takeSample(withReplacement=False, num=20)

[(2, 658199, 568743, 351883, 890475),
 (5, 892423, 530999, 351883, 890475),
 (1, 677551, 860007, 351883, 890475),
 (5, 812152, 689907, 351883, 890475),
 (2, 299578, 572052, 351883, 890475),
 (5, 800959, 322309, 351883, 890475),
 (4, 857620, 709386, 351883, 890475),
 (1, 576235, 462612, 351883, 890475),
 (2, 80673, 566893, 351883, 890475),
 (1, 422961, 821595, 351883, 890475),
 (1, 598053, 438963, 351883, 890475),
 (2, 600679, 599545, 351883, 890475),
 (2, 497255, 188783, 351883, 890475),
 (5, 373339, 398565, 351883, 890475),
 (1, 242087, 844255, 351883, 890475),
 (1, 330533, 214555, 351883, 890475),
 (4, 604737, 574591, 351883, 890475),
 (5, 684131, 898331, 351883, 890475),
 (4, 850200, 164752, 351883, 890475),
 (1, 230591, 874855, 351883, 890475)]

In [247]:
def compute_distance(xi_indexes, yi_indexes):
    """Build a row mapper that appends a Euclidean distance to each row.

    Both points live in the same row: ``xi_indexes`` lists the positions of the
    first point's coordinates and ``yi_indexes`` the positions of the second
    point's coordinates, matched pairwise.

    Args:
        xi_indexes: sequence of row positions for the first point.
        yi_indexes: sequence of row positions for the second point; must have
            the same length as ``xi_indexes``.

    Returns:
        A function ``row -> row + (distance,)`` where ``row`` is a tuple and
        ``distance`` is the float Euclidean distance between the two points.

    Raises:
        ValueError: the two index sequences have different lengths.
    """
    # Fail early, on the driver, instead of silently ignoring surplus indexes
    # or raising IndexError once per row inside a Spark task.
    if len(xi_indexes) != len(yi_indexes):
        raise ValueError(
            "xi_indexes and yi_indexes must have the same length "
            "(got %d and %d)" % (len(xi_indexes), len(yi_indexes))
        )

    # Pair the positions once rather than on every row.
    index_pairs = list(zip(xi_indexes, yi_indexes))

    def append_distance(row):
        # Local accumulator is no longer called `sum`, so the builtin stays usable.
        squared_total = 0
        for xi, yi in index_pairs:
            squared_total += (row[yi] - row[xi]) ** 2
        return row + (sqrt(squared_total),)

    return append_distance


In [248]:
# Row layout at this point: (bucket, x, y, centre_x, centre_y).
# Append the distance between the point (positions 1-2) and the centre
# (positions 3-4).
s1_map = s1_map.map(compute_distance(xi_indexes=[1, 2], yi_indexes=[3, 4]))
s1_map.takeSample(withReplacement=False, num=20)

[(1, 213064, 290947, 351883, 890475, 615389.7444262457),
 (2, 658103, 572250, 351883, 890475, 441630.8854971536),
 (4, 579564, 428498, 351883, 890475, 515035.32528361585),
 (2, 358830, 431609, 351883, 890475, 458918.5840266223),
 (3, 621537, 390372, 351883, 890475, 568169.244437782),
 (2, 807446, 342555, 351883, 890475, 712568.5745028334),
 (3, 920249, 551171, 351883, 890475, 661941.9267367795),
 (2, 796353, 710294, 351883, 890475, 479602.7248264964),
 (1, 618009, 867493, 351883, 890475, 267116.4918158368),
 (5, 875311, 542259, 351883, 890475, 628674.2032563449),
 (4, 821151, 366545, 351883, 890475, 703359.8657330399),
 (3, 147663, 555514, 351883, 890475, 392306.869581709),
 (4, 881789, 727577, 351883, 890475, 554379.0465376555),
 (4, 610722, 418243, 351883, 890475, 538517.1211252247),
 (2, 401708, 409924, 351883, 890475, 483127.0994531356),
 (4, 841292, 155620, 351883, 890475, 882911.6831858099),
 (3, 605911, 379990, 351883, 890475, 570197.4745726256),
 (3, 498730, 183037, 351883, 890

In [339]:
def reduceByKeyMax(dist_indexes):
    """Build a pairwise reducer that keeps the row lying farther from its nearest centre.

    For each of the two rows the smallest value among the positions listed in
    ``dist_indexes`` is taken; the row whose smallest value is strictly larger
    wins. On a tie the second row is returned.

    Args:
        dist_indexes: positions, within a row, of the distance values to
            compare.

    Returns:
        A function ``(x1, x2) -> x1 or x2`` usable as a reduce function.
    """
    def reduce_custom(x1, x2):
        # Read both rows in a single pass over the index list.
        paired = [(x1[position], x2[position]) for position in dist_indexes]
        nearest_x1 = min(first for first, _ in paired)
        nearest_x2 = min(second for _, second in paired)
        return x1 if nearest_x1 > nearest_x2 else x2
    return reduce_custom



In [340]:

# Re-key each row as (bucket id, (fields 1..5)) so that rows can be reduced
# per bucket.
s1_reduce = s1_map.map(lambda row: (row[0], row[1:6]))
s1_reduce.takeSample(withReplacement=False, num=20)


[(5, (101722, 535117, 351883, 890475, 434580.06636867276)),
 (1, (140121, 346875, 351883, 890475, 583390.1821628471)),
 (2, (308044, 164482, 351883, 890475, 727315.4019887109)),
 (4, (295537, 91032, 351883, 890475, 801426.2174180478)),
 (5, (599870, 591110, 351883, 890475, 388737.6433971889)),
 (5, (606809, 572101, 351883, 890475, 407859.37448096)),
 (4, (636291, 360636, 351883, 890475, 601346.2200637832)),
 (3, (604554, 393193, 351883, 890475, 557792.0954665815)),
 (1, (431117, 394746, 351883, 890475, 502021.183016215)),
 (3, (404782, 557913, 351883, 890475, 336742.91090533737)),
 (1, (825141, 535997, 351883, 890475, 591293.3172698639)),
 (2, (770113, 241615, 351883, 890475, 771968.673263365)),
 (5, (814561, 575281, 351883, 890475, 559837.6455009077)),
 (2, (192460, 388323, 351883, 890475, 526851.3301046131)),
 (5, (790774, 303642, 351883, 890475, 732801.6646883384)),
 (5, (236431, 844100, 351883, 890475, 124417.8641875836)),
 (2, (817410, 346944, 351883, 890475, 715640.5073009772)),


In [344]:
# Within each bucket keep the one row whose distance (position 4 of the value
# tuple) is largest.
s1_reduce = s1_reduce.reduceByKey(reduceByKeyMax([4]))
# Bring the per-bucket winners back to the driver for display.
s1_reduce.collect()

[(2, (854996, 81095, 351883, 890475, 953005.0761506992)),
 (4, (869167, 105039, 351883, 890475, 940474.585915005)),
 (1, (883864, 111281, 351883, 890475, 943476.0590481351)),
 (3, (881106, 105566, 351883, 890475, 946656.8132169123)),
 (5, (874672, 104851, 351883, 890475, 943670.1796162683))]

In [346]:
def compute_average(list_reduce, coord_indexes):
    """Average selected coordinates over a list of (key, values) pairs.

    Args:
        list_reduce: sequence of pairs; only the second element of each pair
            (an indexable collection of numbers) is read.
        coord_indexes: positions inside that second element to average.

    Returns:
        A tuple holding one arithmetic mean per entry of ``coord_indexes``,
        in the same order.

    Raises:
        ZeroDivisionError: ``list_reduce`` is empty while ``coord_indexes``
            is not.
    """
    means = []
    for position in coord_indexes:
        column = [pair[1][position] for pair in list_reduce]
        means.append(sum(column) / len(column))
    return tuple(means)

# Candidate for the next centre: the mean of positions 0 and 1 (the point's own
# coordinates) over the per-bucket winners.
new_cluster_point = compute_average(
    list_reduce=s1_reduce.collect(),
    coord_indexes=[0, 1],
)
        

In [348]:
# Show the first centre and the newly derived point, one per line.
print(cluster1_center, new_cluster_point, sep="\n")

(351883, 890475)
(872761.0, 101566.4)
