`Report Title:` k-Means Streaming and Clustering Queries <br>
`Date:` Oct 1, 2022 <br>
`Author:` Prashant Prasad Kanth <br>

In [8]:
import re
import os
import time
import shutil
from pyspark.context import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

In [None]:
# one-based position of the column whose sign is flipped during conversion
group_number = 4
# folder holding the raw input files
dataset = 'dataset'
# logical name -> file name of each input file inside `dataset`
files = {'centers':'centers.txt',
         'data':'data.txt',
         'testdata':'testdata.txt'}
# folder that receives the converted (sign-flipped) copies of the input files
converted = 'dataset/converted'
# root folder the final result files are moved into by the "move files" cells
outpath = 'results'

#### Make one pass over "data.txt", "centers.txt", and "testdata.txt", multiplying the **i-th** column of each file by `-1`, where **i** is the group number (`group_number = 4`). Because column indexing starts at `0`, this updates column index `3`.

In [None]:
# parse line and multiply column 3 by -1
def parse(line, group_number):
    """Parse one bracketed, comma-separated line into a list of floats.

    Square brackets are removed, the remainder is split on commas and every
    field is converted with ``float``. The field at zero-based index
    ``group_number - 1`` is multiplied by -1; all other fields are kept.

    Args:
        line: Text such as ``"[1.0,2.0,3.0,4.0]"``.
        group_number: One-based position of the column to negate.

    Returns:
        list[float]: The parsed values with the selected column negated.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    target = group_number - 1
    # Raw string: in a normal literal "\[" is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    fields = re.sub(r"[\[\]]", "", line).split(",")
    return [
        -1 * float(field) if idx == target else float(field)
        for idx, field in enumerate(fields)
    ]

In [None]:
sc = SparkContext.getOrCreate()
os.makedirs(converted, exist_ok=True)
# convert every input file: negate the group column and store the result
# as a single text file named like the original inside `converted`
for file in files.values():
    # path of the file to be processed
    path = os.path.join(dataset, file)
    filename = os.path.basename(path)
    newFolder = filename.split('.')[0]
    # Temporary folder Spark writes its part files into. It must not be
    # called `outpath`: that global holds the results root used by the
    # later "move files" cells and would otherwise be overwritten here.
    sparkOutDir = os.path.join(converted, newFolder)
    data = sc.textFile(path)
    parsed = data.map(lambda line: parse(line, group_number))
    # coalesce(1) -> the whole output lands in one file, 'part-00000'
    parsed.coalesce(1).saveAsTextFile(sparkOutDir)
    source = os.path.join(sparkOutDir, 'part-00000')
    destination = os.path.join(converted, filename)
    # rename the generated 'part-00000' and move it to the converted folder
    shutil.move(source, destination)
    # delete the folder generated by Spark
    shutil.rmtree(sparkOutDir)

## Streaming K means clustering

In [None]:
def parser(line):
    """Turn the text between the first '[' and the first ']' of ``line``
    into a dense vector, splitting the components on commas."""
    start = line.find('[') + 1
    stop = line.find(']')
    components = line[start:stop].split(',')
    return Vectors.dense(components)

In [None]:
# write predictions in a file
def capturePredictions(rdd, testData, resPath, decay):
    """Append one "(prediction)[point]" line per test point to the
    per-decay prediction file inside ``resPath``.

    Args:
        rdd: Object whose ``collect()`` returns the predicted cluster ids.
        testData: Object whose ``collect()`` returns the test points; each
            point must provide ``tolist()``.
        resPath: Output folder; created when missing.
        decay: Decay factor, used only as part of the file name.
    """
    os.makedirs(resPath, exist_ok=True)
    labels = rdd.collect()
    points = testData.collect()
    target = f"{resPath}/Group4_HW2_task1_test_data_output_{decay}.txt"
    with open(target, 'a+') as out:
        # pairs are matched by position; zip stops at the shorter sequence
        for label, point in zip(labels, points):
            out.write(f"({label}){point.tolist()}\n")

In [None]:
# write final centers and cost in a file
def writeFiles(model, testData, resPath, decay):
    """Append the model's cost on ``testData`` and its cluster centers to
    the two result files inside ``resPath``.

    Args:
        model: Object exposing ``_model.clusterCenters`` and
            ``_model.computeCost(testData)``.
        testData: Data handed to ``computeCost``.
        resPath: Output folder; created when missing.
        decay: Accepted for signature parity with the other writers; it is
            not written to either file.
    """
    os.makedirs(resPath, exist_ok=True)
    centers = model._model.clusterCenters

    costFile = os.path.join(resPath, 'Group4_HW2_task1_cost.txt')
    with open(costFile, 'a+') as costOut:
        costOut.write(f'Cost = [{model._model.computeCost(testData)}]\n')

    centersFile = os.path.join(resPath, 'Group4_HW2_task1_cluster_centers.txt')
    with open(centersFile, 'a+') as centersOut:
        # one "(index)[coordinates]" line per center
        for idx, ctr in enumerate(centers):
            centersOut.write(f'({idx}){ctr.tolist()}\n')

#### Approach 1: `queueStream`

In [None]:
# Approach 1 driver: train StreamingKMeans on a queue-backed stream and
# record predictions, cost and centers for a single decay factor.
# get spark context and streaming context (second argument is the batch duration)
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 1)
# results are written here first; a later cell moves them under `outpath`
resPath = 'dataset/results/queueStream'
decay = 0.75

# load the initial centers from the converted centers file into a local list
initCenters = sc.textFile(os.path.join(converted, files['centers'])).map(parser).collect()
k = len(initCenters)
# every initial center starts with the same weight of 1.0
initWeights = [1. for i in range(k)]
# initialize StreamingKMeans model with the given centers and decay factor
model = StreamingKMeans(k=k, decayFactor=decay).setInitialCenters(centers=initCenters, weights=initWeights)

# get train and test data into rdd (converted files, parsed into dense vectors)
trainData = sc.textFile(os.path.join(converted, files['data'])).map(parser)
testData = sc.textFile(os.path.join(converted, files['testdata'])).map(parser)

# create training windowed stream: the queue holds the whole training RDD
# as a single element
# NOTE(review): 10000 is passed as the window duration, not as a record
# count -- confirm this is the intended meaning
trainingQueue = [trainData]
trainingStream = ssc.queueStream(trainingQueue)
trainingStreamWindow = trainingStream.window(10000)

# create test stream (the whole test RDD as a single queue element)
testingQueue = [testData]
testingStream = ssc.queueStream(testingQueue)

# register the model for training on the windowed stream
model.trainOn(trainingStreamWindow)

# register predictions on the test stream; each resulting RDD is appended to
# the prediction file by capturePredictions
predictions = model.predictOn(testingStream)
predictions.foreachRDD(lambda rdd: capturePredictions(rdd, testData, resPath, decay))

# start streaming context and wait at most 10 seconds
ssc.start()
ssc.awaitTerminationOrTimeout(10)
# stop streaming context after processing, keeping the SparkContext alive
# for the following cells
ssc.stop(stopSparkContext=False, stopGraceFully=True)

# write final cost and cluster centers to the result files
writeFiles(model, testData, resPath, decay)

print(f"Processed for decayFactor: {decay}")

In [4]:
# move files to outer directory path
# Relies on `outpath` holding the results root at this point; verify that
# no earlier cell has rebound it to a different folder.
destDir = os.path.join(outpath, 'queueStream')
# shutil.move does not create missing parent folders, so make sure the
# destination exists before moving anything into it
os.makedirs(destDir, exist_ok=True)
for file in os.listdir(resPath):
    shutil.move(os.path.join(resPath, file), os.path.join(destDir, file))
shutil.rmtree(resPath)

#### Approach 2: `textFileStream`

#### Start writing new files for streaming

In [None]:
# write a new file containing 10000 data points at an interval of 1 second
def writeFileStream(readStreamPath, batch_size=10000, interval=1):
    """Split ``data.txt`` into numbered batch files inside ``readStreamPath``.

    ``data.txt`` is read from the parent folder of ``readStreamPath``. Every
    ``batch_size`` lines are written to ``data-<n>.txt`` (n = 0, 1, ...) in
    ``readStreamPath``, pausing ``interval`` seconds after each file. Lines
    left over at the end are written as a final, shorter batch instead of
    being dropped.

    Args:
        readStreamPath: Folder that receives the batch files; created when
            missing.
        batch_size: Number of lines per batch file (default 10000).
        interval: Seconds to sleep after each batch file (default 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        OSError: If ``data.txt`` cannot be read or a batch cannot be written.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    os.makedirs(readStreamPath, exist_ok=True)
    # The source file sits next to the stream folder. dirname() is used
    # rather than splitting on '/data' so that an ancestor folder whose name
    # starts with "data" does not truncate the path.
    sourceDir = os.path.dirname(os.path.normpath(readStreamPath))

    def emit(lines, index):
        # write one batch file, then pause before the next one
        with open(f"{readStreamPath}/data-{index}.txt", 'w') as g:
            g.writelines(lines)
        time.sleep(interval)

    filecount = 0
    batch = []
    with open(os.path.join(sourceDir, 'data.txt'), 'r') as f:
        for line in f:
            batch.append(line)
            if len(batch) == batch_size:
                emit(batch, filecount)
                filecount += 1
                batch = []
    # flush the trailing partial batch, if any
    if batch:
        emit(batch, filecount)

In [None]:
# Approach 2 driver: for each decay factor, train StreamingKMeans on files
# that appear in `readStreamPath` while the streaming context is running.
# folder watched by textFileStream; writeFileStream fills it with batch files
readStreamPath = 'dataset/converted/data'
# results are written here first; a later cell moves them under `outpath`
resPath = 'dataset/results/textFileStream'
sc = SparkContext.getOrCreate()
# NOTE(review): initCenters, k and initWeights are computed again at the top
# of every loop iteration below, and trainData is not referenced anywhere
# else in this cell -- these assignments look redundant here
initCenters = sc.textFile(os.path.join(converted, files['centers'])).map(parser).collect()
k = len(initCenters)
initWeights = [1. for i in range(k)]
trainData = sc.textFile(os.path.join(converted, files['data'])).map(parser)
testData = sc.textFile(os.path.join(converted, files['testdata'])).map(parser)

# run streaming k-means for different decay factors (1.0, 0.75, 0.5, 0.25, 0.0)
# d takes the values 16, 12, 8, 4, 0 and is divided by 16 below
for d in range(16, -1, -4):
    # a fresh streaming context per decay factor; the previous one is stopped
    # at the end of the preceding iteration
    ssc = StreamingContext(sc, 1)
    # reload the initial centers so every run starts from the same state
    initCenters = sc.textFile(os.path.join(converted, files['centers'])).map(parser).collect()
    k = len(initCenters)
    initWeights = [1. for i in range(k)]
    decay = d/16
    
    model = StreamingKMeans(k=k, decayFactor=decay).setInitialCenters(centers=initCenters, weights=initWeights)

    # training stream: each line of a file appearing in readStreamPath is
    # parsed into a vector
    fileStream = ssc.textFileStream(readStreamPath)
    trainingStream = fileStream.map(lambda row: Vectors.parse(row))
    
    model.trainOn(trainingStream)
    
    # test stream: the whole test RDD as a single queue element
    testDataStream = ssc.queueStream([testData])
    predictions = model.predictOn(testDataStream)
    # the lambda looks up `decay` when it is called; the context is stopped
    # before the next iteration rebinds it
    predictions.foreachRDD(lambda rdd: capturePredictions(rdd, testData, resPath, decay))
    
    # start streaming context
    ssc.start()
    # start writing files in streaming fashion (blocks until all batch files
    # have been written)
    writeFileStream(readStreamPath)
    # wait at most 5 more seconds, then stop while keeping the SparkContext
    ssc.awaitTerminationOrTimeout(5)
    ssc.stop(stopSparkContext=False, stopGraceFully=True)
    # remove folder containing streaming data after processing
    shutil.rmtree(readStreamPath)
    
    # append cost and cluster centers for this decay factor to the result files
    writeFiles(model, testData, resPath, decay)
    print(f"Processed for decayFactor: {decay}")

In [9]:
# move files to outer directory path
# Relies on `outpath` holding the results root at this point; verify that
# no earlier cell has rebound it to a different folder.
destDir = os.path.join(outpath, 'textFileStream')
# shutil.move does not create missing parent folders, so make sure the
# destination exists before moving anything into it
os.makedirs(destDir, exist_ok=True)
for file in os.listdir(resPath):
    shutil.move(os.path.join(resPath, file), os.path.join(destDir, file))
shutil.rmtree(resPath)