In [1]:
from pyspark.sql import SparkSession
# Start (or reuse an already running) Spark session; the application name is 'KMA'.
spark = SparkSession.builder.appName('KMA').getOrCreate()
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import os
import time
import pandas as pd
import numpy as np

import boto3
from io import BytesIO
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role

In [10]:
'''
-----------------------------------------------------------------------------
--- Inputs
-----------------------------------------------------------------------------
'''
# Labels that identify the data set; they are used as S3 key components.
dataSource = 'L4'
Coor = 'Polar'
Interp = 'Interp'
Dimensions = 1

# File name of the input CSV (looked up under the data prefix of the bucket).
dataSetName = 'L4polarInterpDataScaled.csv'

# Numbers of cluster centres (k) to sweep:
#   every k from 2 to 100, then every fifth k from 105 to 500.
clusterRange1 = range(2, 101)
clusterRange2 = range(105, 501, 5)

In [11]:
'''
-----------------------------------------------------------------------------
--- Definitions
-----------------------------------------------------------------------------
'''
# Fixed folder names used when building S3 keys.
# NOTE: the spelling 'Indicies' is part of the existing key layout - do not "fix" it.
Results = 'Results'
Indicies = 'Indicies'
Analysis = 'KMA'

# Key prefix for the per-k prediction files:
#   <dataSource>/<Coor>/<Interp>/<Analysis>/<Results>/<Dimensions>/
resultsPath = '/'.join(
    str(part)
    for part in (dataSource, Coor, Interp, Analysis, Results, Dimensions)
) + '/'

# Configuring S3
s3_bucket_name = 'jasper-ml-sagemaker'
role = get_execution_role()

# Low-level client (used for get_object) and resource-level bucket handle
# (used for upload_file) on the same bucket.
client = boto3.client('s3')
resource = boto3.resource('s3')
my_bucket = resource.Bucket(s3_bucket_name)

In [12]:
'''
-----------------------------------------------------------------------------
--- Importing and Vectorising Data
-----------------------------------------------------------------------------
'''

# Download the input CSV from S3 into pandas. The file has no header row, so
# pandas labels the columns 0..n-1.
dataPath = 'Data/'
dataKey = dataSetName
obj = client.get_object(Bucket=s3_bucket_name, Key=dataPath + dataKey)
dataSet2 = pd.read_csv(obj['Body'], header=None, index_col=None)

# Stage a local copy for Spark to read.
# header=False matters: by default to_csv would write the integer column labels
# ("0,1,2,...") as the first line, and spark.read.csv (header defaults to false)
# would then load that line as a data row. That adds a bogus point to the
# clustering and shifts every prediction by one row relative to dataSet2.
dataSet2.to_csv(dataKey, index=False, header=False)
dataset = spark.read.csv(dataKey, inferSchema=True)

# Pack all columns into the single vector column that KMeans is fitted on.
assembler = VectorAssembler(inputCols=dataset.columns,
                            outputCol='features')
final_data = assembler.transform(dataset)

# The predictions are later joined back onto dataSet2 by row position, so the
# two views of the data must contain exactly the same rows.
assert final_data.count() == len(dataSet2), \
    'Spark and pandas disagree on the number of data rows'

'''
-----------------------------------------------------------------------------
--- Performing Clustering
-----------------------------------------------------------------------------
'''
def runKMA(k):
    """Cluster ``final_data`` into ``k`` centres and locate the characteristic rows.

    Steps:
      1. Fit Spark KMeans (maxIter=2000) on the 'features' column and collect
         the per-row cluster labels into pandas.
      2. Upload the labels to S3 as '<k>results.csv' under ``resultsPath``.
      3. For every cluster, average its member rows of ``dataSet2`` and find the
         row of ``dataSet2`` closest (Euclidean norm) to that average.

    Parameters
    ----------
    k : int
        Number of cluster centres requested from KMeans.

    Returns
    -------
    numpy.ndarray
        Row positions in ``dataSet2`` of the characteristic rows, in descending
        cluster-label order (the ascending list is flipped, as before).

    Prints the elapsed wall-clock time for the whole step.
    """
    timestart = time.time()

    # NOTE(review): no seed is passed to KMeans, so assignments may differ
    # between runs - consider setting one if reproducibility matters.
    kmeans = KMeans(k=k, maxIter=2000, featuresCol='features')
    kMeansModel = kmeans.fit(final_data)
    predictions = kMeansModel.transform(final_data).select('prediction').toPandas()

    # Save at each stage to reduce memory consumption. The local file is only a
    # staging copy, so remove it even when the upload fails.
    resultsKey = '{}results.csv'.format(k)
    predictions.to_csv(resultsKey, index=False)
    try:
        my_bucket.upload_file(resultsKey, Key=resultsPath + resultsKey)
    finally:
        os.remove(resultsKey)

    # Attach the labels to the data (aligned on the row index) and average each
    # cluster; groupby returns the groups sorted by label.
    # NOTE(review): this relies on toPandas() returning rows in file order.
    profile2 = dataSet2.copy()
    profile2['labels'] = predictions['prediction'].astype(np.float64)
    charProfile = profile2.groupby(['labels']).mean().values

    # Iterate over the centres that actually received points instead of
    # range(k): if any label is absent from the predictions, charProfile has
    # fewer than k rows and indexing it with 0..k-1 would raise IndexError.
    points = dataSet2.values
    characteristicIndicies = [
        np.argmin(np.linalg.norm(points - centre, axis=1))
        for centre in charProfile
    ]
    flippedIndicies = np.flip(characteristicIndicies, axis=0)

    print('KMA with {} centres took {} seconds'.format(k, time.time() - timestart))
    return flippedIndicies


# Sweep both ranges of k with the same procedure; one entry per k, in sweep order.
collectedIndicies = []
for clusterRange in (clusterRange1, clusterRange2):
    for i in clusterRange:
        collectedIndicies.append(runKMA(i))
    
# Uploading the characteristic indices
indiciesPath = '{}/{}/{}/{}/{}/'.format(dataSource, Coor, Interp, Analysis, Indicies)
dataKey = '{}DimCharIndicies.npy'.format(Dimensions)

# collectedIndicies holds one index array per k and their lengths differ, so it
# is a ragged sequence. Handing such a list straight to np.save makes NumPy try
# to build a regular array from it, which raises ValueError on NumPy >= 1.24
# (and was deprecated before that). Build the 1-D object array explicitly.
raggedIndicies = np.empty(len(collectedIndicies), dtype=object)
for position, indexArray in enumerate(collectedIndicies):
    raggedIndicies[position] = indexArray

# Object arrays are stored pickled: read back with np.load(..., allow_pickle=True).
np.save(dataKey, raggedIndicies, allow_pickle=True)
try:
    my_bucket.upload_file(dataKey, Key=indiciesPath + dataKey)
finally:
    # The local file is only a staging copy for the upload.
    os.remove(dataKey)

KMA with 2 centres took 4.340428829193115 seconds
KMA with 3 centres took 4.881385564804077 seconds
KMA with 4 centres took 5.102336168289185 seconds
KMA with 5 centres took 6.383940935134888 seconds
KMA with 6 centres took 5.8996055126190186 seconds
KMA with 7 centres took 5.906062602996826 seconds
KMA with 8 centres took 7.4120261669158936 seconds
KMA with 9 centres took 8.774024248123169 seconds
KMA with 10 centres took 8.278908729553223 seconds
KMA with 11 centres took 6.995883464813232 seconds
KMA with 12 centres took 7.376529216766357 seconds
KMA with 13 centres took 8.379830598831177 seconds
KMA with 14 centres took 8.851609468460083 seconds
KMA with 15 centres took 11.2695152759552 seconds
KMA with 16 centres took 11.814684629440308 seconds
KMA with 17 centres took 9.977014064788818 seconds
KMA with 18 centres took 10.128883361816406 seconds
KMA with 19 centres took 11.601832151412964 seconds
KMA with 20 centres took 11.520936250686646 seconds
KMA with 21 centres took 11.377424

KMA with 405 centres took 103.03257346153259 seconds
KMA with 410 centres took 100.35106754302979 seconds
KMA with 415 centres took 106.7320077419281 seconds
KMA with 420 centres took 106.50774455070496 seconds
KMA with 425 centres took 108.82374358177185 seconds
KMA with 430 centres took 106.26948118209839 seconds
KMA with 435 centres took 108.54064631462097 seconds
KMA with 440 centres took 112.0652403831482 seconds
KMA with 445 centres took 111.3667893409729 seconds
KMA with 450 centres took 111.85944485664368 seconds
KMA with 455 centres took 109.47186350822449 seconds
KMA with 460 centres took 116.41600060462952 seconds
KMA with 465 centres took 116.26612663269043 seconds
KMA with 470 centres took 117.80363988876343 seconds
KMA with 475 centres took 117.61896109580994 seconds
KMA with 480 centres took 120.01164531707764 seconds
KMA with 485 centres took 116.07660174369812 seconds
KMA with 490 centres took 120.85137367248535 seconds
KMA with 495 centres took 118.65988945960999 seco