In [1]:
from pyspark import SparkConf, SparkContext
import numpy as np

In [2]:
def _parse_vector(line):
    """Parse one line of whitespace-separated numbers into a float array."""
    return np.array([float(tok) for tok in line.split()])


def _format_centroid(vec):
    """Render a 1-D array as one line of single-space-separated numbers."""
    # An explicit ' ' separator guarantees adjacent numbers can never be fused
    # together; brackets are stripped by character rather than by a fixed-width
    # slice so no digit is lost regardless of how numpy pads the elements.
    text = np.array2string(vec, separator=' ', suppress_small=True)
    return ' '.join(text.strip('[]').split())


def pyspark_kmeans(data_txt, c_txt, max_iter=20, output_path="Output1.txt"):
    """Cluster points with k-means (Euclidean distance) on Spark.

    Parameters
    ----------
    data_txt : str
        Path to a text file with one data point per line, coordinates
        separated by whitespace.
    c_txt : str
        Path to a text file with the initial centroids, same line format.
    max_iter : int, optional
        Number of assignment/update iterations to run (default 20).
    output_path : str, optional
        File the final centroids are written to (default "Output1.txt").

    Returns
    -------
    str
        The final centroids, one per line, coordinates separated by a single
        space. The same text is written to ``output_path``.

    Notes
    -----
    * Calls ``np.set_printoptions(suppress=True)``, which changes numpy's
      print options for the whole process.
    * A centroid that ends up with no assigned points is not carried over to
      the next iteration, so fewer centroids than were supplied may be
      returned.
    * Rows appear in the order ``collect()`` returns them; this is not
      guaranteed to match the order of the initial centroids.
    """
    # suppress scientific notation for small floats
    np.set_printoptions(suppress=True)

    # Reuse a running context if there is one so this function can be called
    # more than once in the same process (e.g. re-running the notebook cell).
    conf = SparkConf()
    sc = SparkContext.getOrCreate(conf=conf)

    # load data and initial centroids, ignoring blank lines
    data = (sc.textFile(data_txt)
            .filter(lambda line: line.strip())
            .map(_parse_vector)
            .cache())
    centroids = (sc.textFile(c_txt)
                 .filter(lambda line: line.strip())
                 .map(_parse_vector))

    # (point, point_id) pairs; the id is the key used to group candidates
    dat = data.zipWithIndex()

    for _ in range(max_iter):
        # (centroid, centroid_id) pairs; ids are only labels for this iteration
        cen = centroids.zipWithIndex()

        # pair every point with every centroid
        pairs = dat.cartesian(cen)

        # point_id -> (distance, point, centroid_id)
        dist = pairs.map(
            lambda p: (p[0][1],
                       (np.linalg.norm(p[0][0] - p[1][0]), p[0][0], p[1][1])))

        # Keep the nearest centroid per point. Compare on (distance, id) only:
        # comparing the whole tuple would reach the ndarray element on a
        # distance tie and raise "truth value of an array is ambiguous".
        nearest = dist.reduceByKey(
            lambda a, b: min(a, b, key=lambda v: (v[0], v[2])))

        # centroid_id -> (point, 1), then sum points and counts per cluster
        assigned = nearest.map(lambda kv: (kv[1][2], (kv[1][1], 1)))
        totals = assigned.reduceByKey(
            lambda a, b: (a[0] + b[0], a[1] + b[1]))

        # new centroid = mean of the points assigned to it
        centroids = totals.map(lambda kv: kv[1][0] / kv[1][1])

    final_centroids = centroids.collect()

    # one centroid per line, no trailing newline
    new_txt = '\n'.join(_format_centroid(c) for c in final_centroids)

    # context manager closes the file even if the write fails
    with open(output_path, "w") as text_file:
        text_file.write(new_txt)

    return new_txt

In [3]:
# Run k-means on the sample data with the initial centroids from c1.txt.
# Besides returning the centroid text, the call also writes it to Output1.txt.
new_txt = pyspark_kmeans("datafile3.txt", "c1.txt")
print(new_txt)

0.2304 0.2156 0.47886667 0.38586667 0.4118 0.19693333 0.10193333 0.2278 0.349 0.5122 0.11673333 0.7602 0.1778 0.121 0.1824 0.284 0.25413333 0.26826667 2.12773333 0.25573333 1.48686667 0.4286 0.15606667 0.31173333 0.28386667 0.06773333 0.0158 0.0074 0.00306667 0.0148 0.00786667 0.0018 0.0406 0.0018 0.0172 0.0334 0.04833333 0.00846667 0.02786667 0.0374 0.00773333 0.0738 0.01793333 0.00933333 0.0524 0.03026667 0.00366667 0.00753333 0.07678667 0.13656667 0.01441333 0.79679333 0.23694667 0.08964 5.21684667 95.96 705.28 0.8933333
0.15384615 0.16606838 0.41940171 0.17136752 0.37290598 0.12410256 0.11495726 0.1642735 0.40991453 0.53538462 0.14871795 0.55504274 0.18675214 0.06136752 0.13205128 0.41376068 0.16333333 0.38452991 1.84410256 0.28418803 1.15042735 1.09649573 0.19051282 0.25726496 0.06991453 0.02957265 0.00205128 0.01179487 0.00205128 0.00247863 0.00136752 0. 0.01760684 0.00247863 0.04162393 0.03418803 0.04461538 0.00179487 0.01837607 0.04376068 0.0008547 0.0134188 0.00786325 0.008974