In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sys
import time

import matplotlib.pyplot as plt
import seaborn as sns


import itertools
from sklearn.preprocessing import scale
from sklearn import preprocessing

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ops').getOrCreate()
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.sql import functions as F
from pyspark.sql import types as T
from  pyspark.sql.functions import abs, pow
from pyspark.sql.types import FloatType

import boto3
from io import BytesIO
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role

# STEPS

1. Import Data
2. Create umbrella directory
3. Vectorize features

      a. Apply PCA to features range 1 - 100
            i. Create PCA directory
            ii. Apply KMeans to each PCA output, with the number of clusters k taken from the ranges (2,101,1) and (105,2001,5)
                
                I. save output

In [2]:
'''
-----------------------------------------------------------------------------
--- Inputs
-----------------------------------------------------------------------------
'''
# Experiment labels. Later cells join these into the S3 key prefixes under
# which results and characteristic indices are stored.
Analysis, Interp, Coor = 'PCA', 'NoInterp', 'Char'
dataSource = 'L6'

# File name of the dataset CSV for the selected data source.
dataSetName = dataSource + 'dataScaled.csv'

# Numbers of PCA components to sweep over (passed as k to PCA).
DRrange = range(100, 101)

# Cluster counts passed as k to KMeans: a unit-step range followed by a
# coarser range with step 5.
clusterRange1 = range(2, 4)
clusterRange2 = range(105, 111, 5)


In [3]:
'''
-----------------------------------------------------------------------------
--- Definitions
-----------------------------------------------------------------------------
'''
# Folder names used as components of S3 keys. Results and Indicies are used
# by the PCA/KMeans cell; Response and Figures are not referenced in the
# cells visible here.
Results, Indicies, Response, Figures = 'Results', 'Indicies', 'Response', 'Figures'

# Configuring S3
s3_bucket_name = 'jasper-ml-sagemaker'
role = get_execution_role()

# Two handles on S3: the low-level client (used for get_object) and a
# resource-level Bucket object (used for upload_file).
client = boto3.client('s3')
resource = boto3.resource('s3')
my_bucket = resource.Bucket(s3_bucket_name)

In [8]:
dataPath = 'Data/'

# Dataset
# Copy the CSV from S3 to the working directory unchanged and let Spark parse
# it. The previous version parsed the whole file with pandas and wrote it back
# out, which held the full dataset in driver memory and let pandas treat the
# first line as a header (renaming duplicate entries) before Spark saw it.
dataKey = dataSetName
client.download_file(Bucket=s3_bucket_name, Key=dataPath + dataKey, Filename=dataKey)
# NOTE(review): no `header` option is passed, so Spark reads the first line of
# the file as a data row. Confirm the CSV really has no header line.
# The local copy is intentionally kept: Spark reads it lazily in later cells.
dataSet = spark.read.csv(dataKey, inferSchema=True)

# Response Data
# The .npy file is small enough to be read into memory straight from S3.
dataKey = '{}FixedResponse.npy'.format(dataSource)
obj = client.get_object(Bucket=s3_bucket_name, Key=dataPath + dataKey)
responseData = np.load(BytesIO(obj['Body'].read()))

In [9]:
# Vectorizing features: every column of the dataset is packed into a single
# vector column called 'features', the input expected by the PCA step.
assembler = VectorAssembler(
    inputCols=dataSet.columns,
    outputCol='features',
)
vectorisedData = assembler.transform(dataSet)

In [13]:
# For every PCA dimensionality in DRrange: project the data, then run KMeans
# for every cluster count in clusterRange1 followed by clusterRange2. Each
# clustering is uploaded to S3 as soon as it is produced, and the
# characteristic indices of all clusterings are uploaded once per dimensionality.


def uploadAndRemove(localName, s3Key):
    """Upload a local file to the bucket and delete the local copy.

    The local file is removed even if the upload raises, so a failed transfer
    does not leave temporary files in the working directory. The exception
    itself still propagates to the caller.
    """
    try:
        my_bucket.upload_file(localName, Key=s3Key)
    finally:
        os.remove(localName)


def findCharacteristicIndices(profile, labels):
    """Return, per cluster, the row position nearest to the cluster mean.

    Parameters
    ----------
    profile : pandas.DataFrame
        One row per sample, one column per PCA component.
    labels : pandas.Series
        Cluster label of each row of ``profile`` (same index).

    Returns
    -------
    list
        One row position per cluster that actually received samples, ordered
        by ascending label. Iterating over the computed means (instead of
        ``range(k)``) avoids an IndexError when a cluster ends up empty.
    """
    points = profile.values
    clusterMeans = profile.groupby(labels).mean().values
    return [np.argmin(np.linalg.norm(points - centre, axis=1)) for centre in clusterMeans]


def packIndices(indexArrays):
    """Combine the per-k index arrays into one array that ``np.save`` accepts.

    Arrays of equal length are stacked into a regular 2-D array. Arrays of
    different lengths (the normal case when several k are swept) are stored in
    a 1-D object array, because recent NumPy versions refuse to build a ragged
    array implicitly.
    """
    if len({len(indices) for indices in indexArrays}) <= 1:
        return np.array(indexArrays)
    packed = np.empty(len(indexArrays), dtype=object)
    for position, indices in enumerate(indexArrays):
        packed[position] = indices
    return packed


# UDF that converts a Spark ML vector into a plain list so the column survives
# toPandas(). It does not depend on the loop variables, so it is built once.
# NOTE(review): FloatType is single precision, so `profile` holds rounded
# copies of the double-precision features KMeans is fitted on - confirm this
# is acceptable.
to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))

for j in DRrange:

    collectedIndicies = []
    Dimensions = j
    resultsPath = '{}/{}/{}/{}/{}/{}/'.format(dataSource, Coor, Interp, Analysis, Results, Dimensions)

    # Perform PCA on the data
    pca = PCA(k=j, inputCol="features", outputCol="pcaFeatures")
    PCAModel = pca.fit(vectorisedData)
    PCAData = PCAModel.transform(vectorisedData).select("pcaFeatures")

    # pandas copy of the projected data (one column per component), used to
    # locate the sample closest to each cluster mean.
    profileArray = PCAData.withColumn('pcaFeatures', to_array('pcaFeatures')).toPandas()
    profile = pd.DataFrame(profileArray.pcaFeatures.values.tolist(), index=profileArray.index)

    for i in itertools.chain(clusterRange1, clusterRange2):
        # Perform KMeans on the data
        # NOTE(review): no seed is passed to KMeans; set one if repeated runs
        # must produce the same clustering.
        timestart = time.time()
        kmeans = KMeans(k=i, maxIter=2000, featuresCol='pcaFeatures')
        kMeansModel = kmeans.fit(PCAData)
        predictions = kMeansModel.transform(PCAData).select('prediction').toPandas()

        # save at each stage to reduce memory consumption
        dataKey = '{}results.csv'.format(i)
        predictions.to_csv(dataKey, index=False)
        uploadAndRemove(dataKey, resultsPath + dataKey)

        print('PCA {} elements & {} centres took {} seconds'.format(j, i, time.time() - timestart))

        # Calculate characteristic indicies
        timeStartIndicies = time.time()
        characteristicIndicies = findCharacteristicIndices(profile, predictions['prediction'])
        collectedIndicies.append(np.flip(characteristicIndicies, axis=0))
        print('Completed indicies for Dim = {} & k = {} in {:.2f} s'.format(Dimensions, i, time.time() - timeStartIndicies))

    # Uploading Char Indicies
    indiciesPath = '{}/{}/{}/{}/{}/'.format(dataSource, Coor, Interp, Analysis, Indicies)
    dataKey = '{}DimCharIndicies.npy'.format(Dimensions)
    np.save(dataKey, packIndices(collectedIndicies))
    uploadAndRemove(dataKey, indiciesPath + dataKey)

PCA 100 elements & 2 centres took 10.678924322128296 seconds
Completed indicies for Dim = 100 & k = 2 in 0.11 s
PCA 100 elements & 3 centres took 10.956296443939209 seconds
Completed indicies for Dim = 100 & k = 3 in 0.13 s
PCA 100 elements & 105 centres took 16.528581380844116 seconds
Completed indicies for Dim = 100 & k = 105
PCA 100 elements & 110 centres took 14.225117444992065 seconds
Completed indicies for Dim = 100 & k = 110


In [None]:
# Scratch cell: rebuild the pandas copy of the PCA features from the PCAData
# left behind by the PCA/KMeans cell (that cell must have been run first).
# The intermediate frame is kept under its own name; the earlier version
# referred to an undefined `df` and raised NameError.
to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
profileArray = PCAData.withColumn('pcaFeatures', to_array('pcaFeatures')).toPandas()
profile = pd.DataFrame(profileArray.pcaFeatures.values.tolist(), index=profileArray.index)

In [None]:
# Scratch cell: recompute the characteristic indices for the last fitted
# KMeans model. Relies on kMeansModel, PCAData and profile left behind by
# earlier cells.
predictions = kMeansModel.transform(PCAData).select('prediction').toPandas()

# Mean of every PCA component per cluster label (rows ordered by label).
profile2 = profile.copy()
profile2['labels'] = predictions.astype(np.float64)
charProfile = profile2.groupby(['labels']).mean().values

# For each cluster mean, take the position of the closest sample. Iterating
# over the means themselves replaces the undefined `pred_auto` / `jj` names
# and removes the dependency on a loop variable leaked from another cell.
characteristicIndicies = []
for centre in charProfile:
    characteristicIndicies.append(np.argmin(np.linalg.norm(profile - centre, axis=1)))