In [1]:
# Standard library
import os
import time
from io import BytesIO

# Third-party
import boto3
import pandas as pd
import sagemaker.amazon.common as smac
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import PCA
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from sagemaker import get_execution_role

# Spark session used by the cells below (created once all imports are done)
spark = SparkSession.builder.appName('ops').getOrCreate()

# STEPS

1. Import Data
2. Create umbrella directory
3. Vectorize features

      a. Apply PCA to features range 1 - 100
            i. Create PCA directory
            ii. Apply Kmeans to each PCA range (2,101,1) , (105,2001,5)
                
                I. save output

In [2]:
'''
-----------------------------------------------------------------------------
--- Inputs
-----------------------------------------------------------------------------
'''
# Label used as a component of the results path, and the S3 key of the dataset
Interp = 'NoInterp'
s3DataSet = 'scaledData.csv'

# Sweep ranges (stop values are exclusive)
PCARange = range(70, 101)            # PCA component counts 70..100
clusterRange1 = range(2, 101)        # cluster counts 2..100, step 1
clusterRange2 = range(105, 1001, 5)  # cluster counts 105..1000, step 5


In [3]:
'''
-----------------------------------------------------------------------------
--- Definitions
-----------------------------------------------------------------------------
'''
# Fixed components of the results path (combined with Interp and the PCA
# dimension count when results are uploaded)
Results = "Results"
Analysis = "PCA"

# Configuring S3
s3_bucket_name = "jasper-ml-sagemaker"
# NOTE(review): `role` is not referenced by any other visible cell - confirm
# whether it is still needed.
role = get_execution_role()

# Low-level client (used to download the dataset) and resource-level bucket
# handle (used to upload results)
client = boto3.client("s3")
resource = boto3.resource("s3")
my_bucket = resource.Bucket(s3_bucket_name)

In [4]:
# Importing Data
# Download the dataset object from S3, write a local CSV copy, and load that
# copy into a Spark DataFrame.
dataKey = s3DataSet
obj = client.get_object(Bucket=s3_bucket_name, Key=dataKey)
# header=False is required here: the source is read with header=None (no header
# row), so the local copy must not gain one. With pandas' default header=True the
# integer column labels (0,1,2,...) would be written as the first line, and
# spark.read.csv (whose `header` option defaults to false) would then load that
# line as a spurious first data row.
pd.read_csv(obj['Body'], header=None, index_col=None).to_csv(dataKey, index=False, header=False)
dataset = spark.read.csv(dataKey, inferSchema=True)

In [5]:
# Vectorizing features
# Combine every column of `dataset` into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=dataset.columns, outputCol='features')
vectorisedData = assembler.transform(dataset)

In [None]:
# Sweep every PCA dimensionality in PCARange. For each projection, fit KMeans
# for every cluster count in clusterRange1 followed by clusterRange2, write the
# predictions to a temporary local CSV, upload it to S3 and delete the local copy.
#
# Both cluster ranges use the same loop body, so they are concatenated once here
# instead of duplicating the loop.
clusterCounts = list(clusterRange1) + list(clusterRange2)

for j in PCARange:

    Dimensions = j
    # S3 key prefix for this dimensionality: Results/Interp/Analysis/Dimensions/
    resultsPath = '{}/{}/{}/{}/'.format(Results, Interp, Analysis, Dimensions)

    # Perform PCA on the data
    pca = PCA(k=j, inputCol="features", outputCol="pcaFeatures")
    PCAModel = pca.fit(vectorisedData)

    # Cache the projected data: it is read again by every KMeans fit and
    # transform below and would otherwise be recomputed from vectorisedData
    # on each use.
    PCAData = PCAModel.transform(vectorisedData).select("pcaFeatures").cache()

    try:
        for i in clusterCounts:
            # Perform Kmeans on the data
            timestart = time.time()
            kmeans = KMeans(k=i, maxIter=2000, featuresCol='pcaFeatures')
            kMeansModel = kmeans.fit(PCAData)

            # save at each stage to reduce memory consumption
            dataKey = '{}results.csv'.format(i)
            try:
                kMeansModel.transform(PCAData).select('prediction').toPandas().to_csv(dataKey, index=False)
                my_bucket.upload_file(dataKey, Key=resultsPath + dataKey)
            finally:
                # Remove the temporary file even when writing or uploading fails;
                # the existence check avoids masking the original error.
                if os.path.exists(dataKey):
                    os.remove(dataKey)

            print('PCA {} elements & {} centres took {} seconds'.format(j,i,time.time()-timestart))
    finally:
        # Release the cached projection before moving to the next dimensionality.
        PCAData.unpersist()

PCA 70 elements & 2 centres took 4.072444438934326 seconds
PCA 70 elements & 3 centres took 2.77089524269104 seconds
PCA 70 elements & 4 centres took 2.97157621383667 seconds
PCA 70 elements & 5 centres took 3.117504596710205 seconds
PCA 70 elements & 6 centres took 4.300659894943237 seconds
PCA 70 elements & 7 centres took 3.3565211296081543 seconds
PCA 70 elements & 8 centres took 3.0069150924682617 seconds
PCA 70 elements & 9 centres took 3.1150949001312256 seconds
PCA 70 elements & 10 centres took 2.569612979888916 seconds
PCA 70 elements & 11 centres took 3.3332366943359375 seconds
PCA 70 elements & 12 centres took 2.83190655708313 seconds
PCA 70 elements & 13 centres took 3.891472816467285 seconds
PCA 70 elements & 14 centres took 3.4414939880371094 seconds
PCA 70 elements & 15 centres took 2.8558969497680664 seconds
PCA 70 elements & 16 centres took 3.623469114303589 seconds
PCA 70 elements & 17 centres took 3.711143970489502 seconds
PCA 70 elements & 18 centres took 3.658209085

PCA 70 elements & 290 centres took 6.505471229553223 seconds
PCA 70 elements & 295 centres took 4.833523988723755 seconds
PCA 70 elements & 300 centres took 6.029895782470703 seconds
PCA 70 elements & 305 centres took 6.960510492324829 seconds
PCA 70 elements & 310 centres took 4.990048408508301 seconds
PCA 70 elements & 315 centres took 5.473577499389648 seconds
PCA 70 elements & 320 centres took 6.081095933914185 seconds
PCA 70 elements & 325 centres took 5.710643529891968 seconds
PCA 70 elements & 330 centres took 5.429255485534668 seconds
PCA 70 elements & 335 centres took 8.323692560195923 seconds
PCA 70 elements & 340 centres took 5.428043842315674 seconds
PCA 70 elements & 345 centres took 5.754020929336548 seconds
PCA 70 elements & 350 centres took 6.526065349578857 seconds
PCA 70 elements & 355 centres took 6.727956056594849 seconds
PCA 70 elements & 360 centres took 6.719928503036499 seconds
PCA 70 elements & 365 centres took 6.570165157318115 seconds
PCA 70 elements & 370 ce

PCA 70 elements & 965 centres took 9.608792543411255 seconds
PCA 70 elements & 970 centres took 10.410907983779907 seconds
PCA 70 elements & 975 centres took 9.883821964263916 seconds
PCA 70 elements & 980 centres took 9.173968315124512 seconds
PCA 70 elements & 985 centres took 8.37895941734314 seconds
PCA 70 elements & 990 centres took 9.23768401145935 seconds
PCA 70 elements & 995 centres took 10.354368925094604 seconds
PCA 70 elements & 1000 centres took 10.245283842086792 seconds
PCA 71 elements & 2 centres took 2.1386988162994385 seconds
PCA 71 elements & 3 centres took 2.139894962310791 seconds
PCA 71 elements & 4 centres took 2.1956121921539307 seconds
PCA 71 elements & 5 centres took 2.3278074264526367 seconds
PCA 71 elements & 6 centres took 3.5920567512512207 seconds
PCA 71 elements & 7 centres took 2.737473964691162 seconds
PCA 71 elements & 8 centres took 2.5824503898620605 seconds
PCA 71 elements & 9 centres took 2.7218098640441895 seconds
PCA 71 elements & 10 centres too

PCA 71 elements & 250 centres took 6.274431467056274 seconds
PCA 71 elements & 255 centres took 4.5382513999938965 seconds
PCA 71 elements & 260 centres took 5.922626495361328 seconds
PCA 71 elements & 265 centres took 5.643138885498047 seconds
PCA 71 elements & 270 centres took 5.561656475067139 seconds
PCA 71 elements & 275 centres took 5.083366870880127 seconds
PCA 71 elements & 280 centres took 5.645153045654297 seconds
PCA 71 elements & 285 centres took 5.35500955581665 seconds
PCA 71 elements & 290 centres took 6.770225763320923 seconds
PCA 71 elements & 295 centres took 5.684632062911987 seconds
PCA 71 elements & 300 centres took 6.982269287109375 seconds
PCA 71 elements & 305 centres took 6.0176026821136475 seconds
PCA 71 elements & 310 centres took 5.673459529876709 seconds
PCA 71 elements & 315 centres took 5.548643112182617 seconds
PCA 71 elements & 320 centres took 5.704270124435425 seconds
PCA 71 elements & 325 centres took 6.3028786182403564 seconds
PCA 71 elements & 330 