# Getting the data

In [47]:
from pyspark import SparkContext, SparkConf
from math import sqrt,log
from random import randint, random


sc = SparkContext.getOrCreate()
sc.stop()

conf = SparkConf().setAppName("kmeans").setMaster("local[*]")
sc = SparkContext(conf=conf)

* Getting data from S-sets : https://cs.joensuu.fi/sipu/datasets/
* Using the s1 dataset

In [48]:
s1 = sc.textFile("../data/s1.txt")


def extract_split(x):
    splits = x.split('    ')
    return (int(splits[1]), int(splits[2]))

s1 = s1.flatMap(lambda x: x.split('\n')).map(extract_split)



In [49]:
s1.takeSample(False, 20)

[(479631, 178459),
 (615649, 429323),
 (197849, 575524),
 (832154, 559403),
 (668367, 855686),
 (700341, 869730),
 (746769, 205186),
 (863102, 545783),
 (646816, 829778),
 (251032, 876118),
 (533551, 582247),
 (391530, 398743),
 (346144, 560472),
 (155274, 552551),
 (559076, 190770),
 (789388, 712210),
 (353333, 113156),
 (894124, 202052),
 (858042, 178196),
 (614279, 599251)]

# Kmeans ++ intitialization

In [36]:
cluster1_center = s1.takeSample(False, 1)[0]

In [37]:
s1_map = s1.map(lambda x : x + cluster1_center)
s1_map.takeSample(False,20)

[(396517, 394572, 808686, 332209),
 (721905, 378957, 808686, 332209),
 (474652, 193224, 808686, 332209),
 (804119, 328544, 808686, 332209),
 (432656, 783776, 808686, 332209),
 (227819, 900986, 808686, 332209),
 (546565, 151576, 808686, 332209),
 (854769, 713941, 808686, 332209),
 (874544, 686096, 808686, 332209),
 (618676, 377015, 808686, 332209),
 (612648, 772280, 808686, 332209),
 (367048, 553451, 808686, 332209),
 (675551, 956849, 808686, 332209),
 (407429, 399155, 808686, 332209),
 (344096, 575522, 808686, 332209),
 (551291, 576721, 808686, 332209),
 (487472, 166999, 808686, 332209),
 (860384, 555572, 808686, 332209),
 (399291, 386712, 808686, 332209),
 (274495, 182470, 808686, 332209)]

In [38]:
def compute_distance(xi_indexes, yi_indexes):
    def bar(row):
        sum = 0
        for i in range(len(xi_indexes)):
            sum += (row[yi_indexes[i]] - row[xi_indexes[i]]) ** 2
        distance = sqrt(sum)
        return row + (distance,)
    return bar


In [39]:
def reduceMax(dist_indexes):
    def reduce_custom(x1,x2):
        dist_x1 = []
        dist_x2 = []
        for idx in dist_indexes:
            dist_x1.append(x1[idx])
            dist_x2.append(x2[idx])
        
        mindist_x1 =  min(dist_x1)
        mindist_x2 = min(dist_x2)
        
        
        if(mindist_x1 > mindist_x2):
            return x1
        else:
            return x2
        
        
        '''
        #Another version with more randomness => should converge less
        if(mindist_x1 != 0):
            drawx1 = log(mindist_x1) * random() 
        else:
            drawx1 = 0
        if(mindist_x2 != 0):
            drawx2 = log(mindist_x2) * random()
        else:
            drawx2 = 0
            
            
        if(drawx1 > drawx2):
            return x1
        else:
            return x2
        '''
        
        
    return reduce_custom



In [44]:
def reduceMaxWithRandom(dist_indexes):
    def reduce_custom(x1,x2):
        dist_x1 = []
        dist_x2 = []
        for idx in dist_indexes:
            dist_x1.append(x1[idx])
            dist_x2.append(x2[idx])
        
        mindist_x1 =  min(dist_x1)
        mindist_x2 = min(dist_x2)
        
        
        if(mindist_x1 != 0):
            drawx1 = log(mindist_x1) * random() 
        else:
            drawx1 = 0
        if(mindist_x2 != 0):
            drawx2 = log(mindist_x2) * random()
        else:
            drawx2 = 0
            
            
        if(drawx1 > drawx2):
            return x1
        else:
            return x2
        
            
    return reduce_custom

In [45]:
def compute_average(list_reduce, coord_indexes):
    result = ()
    for idx in coord_indexes:
        coord_list = [reduce_tuple[1][idx] for reduce_tuple in list_reduce]
        average = sum(coord_list)/len(coord_list)
        result += (average,)
    return result
        

In [55]:
def initCluster(data,num_clusters,num_features, reducer):
    xi_indexes = [idx for idx in range(0,num_features)]
    yi_indexes = [idx for idx in range(num_features, num_features*2)]
    coord_indexes = [idx for idx in range(num_features)]
    dist_indexes = [num_features*2]
    current_clust = 2
    
    '''
    print("Initial xi_indexes : ", xi_indexes)
    print("Initial yi_indexes : ", yi_indexes)
    print("Initial coord_indexes : ",coord_indexes)
    print("Initial dist_indexes : ", dist_indexes)
    print("Intial current_clust :", current_clust)
    print("\n")
    '''
    
    for _ in range(num_clusters-1):
        
        
        data = data.map(compute_distance(xi_indexes, yi_indexes))
        
        reduce_tuple = data.reduce(reducer(dist_indexes=dist_indexes))
                
        new_cluster_point = tuple(reduce_tuple[i] for i in coord_indexes)
        
        #print("Computing center of cluster n° {}".format(current_clust))
        print("New cluster point : {}".format(new_cluster_point))
        
        data = data.map(lambda x:(x + new_cluster_point))
        
        
        #Update of variables
        current_clust += 1
        dist_indexes.append(current_clust*num_features + current_clust - 2)
        yi_indexes = [old_value + num_features + 1 for old_value in yi_indexes]
        
        #print("New yi indexes : {}".format(yi_indexes))
        #print("New dist_indexes : {}".format(dist_indexes))
        print("\n")

    return data
        


In [42]:


s1_map = initCluster(s1_map, num_features=2,num_clusters=20)


Initial xi_indexes :  [0, 1]
Initial yi_indexes :  [2, 3]
Initial coord_indexes :  [0, 1]
Initial dist_indexes :  [4]
Intial current_clust : 2


Computing center of cluster n° 2
New cluster point : (139601, 914203)
New yi indexes : [5, 6]
New dist_indexes : [4, 7]


Computing center of cluster n° 3
New cluster point : (155625, 262321)
New yi indexes : [8, 9]
New dist_indexes : [4, 7, 10]


Computing center of cluster n° 4
New cluster point : (267049, 950249)
New yi indexes : [11, 12]
New dist_indexes : [4, 7, 10, 13]


Computing center of cluster n° 5
New cluster point : (143732, 279724)
New yi indexes : [14, 15]
New dist_indexes : [4, 7, 10, 13, 16]


Computing center of cluster n° 6
New cluster point : (267049, 950249)
New yi indexes : [17, 18]
New dist_indexes : [4, 7, 10, 13, 16, 19]


Computing center of cluster n° 7
New cluster point : (143732, 279724)
New yi indexes : [20, 21]
New dist_indexes : [4, 7, 10, 13, 16, 19, 22]


Computing center of cluster n° 8
New cluster point : (2

In [59]:
cluster1_center = s1.takeSample(False, 1)[0]
print("Initial cluster center : ",cluster1_center, "\n")
s1_map = s1.map(lambda x : x + cluster1_center)
s1_map_clustered = initCluster(s1_map, num_features=2,num_clusters=20, reducer=reduceMax)


Initial cluster center :  (326043, 553772) 

New cluster point : (895835, 123485)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)


New cluster point : (878690, 739021)


New cluster point : (854996, 81095)




In [None]:
print("Initial cluster center : ",a cluster1_center, "\n")
s1_map_clustered2 = initCluster(s1_map, anum_features=2,num_clusters=20, reducer=reduceMax)
