In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running against the "local" master for this project.
spark = (
    SparkSession.builder
    .master("local")
    .appName("ClusteringProj")
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans

# Read the session log CSV: the first row supplies column names and Spark infers the types.
data = spark.read.csv(
    "hack_data.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Print every column's name, type and nullability to check what inferSchema produced.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [4]:
# Preview the first ten rows of the loaded data.
data.show(10)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [5]:
# Display the DataFrame's column names as a Python list.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
# Columns used as clustering features; the string-typed 'Location' column is left out.
feat_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [15]:
# Combine the selected feature columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feat_cols,
    outputCol="features",
)

In [17]:
# Run the assembler over the loaded data, producing a DataFrame that carries the "features" column.
final_data = assembler.transform(data)

In [18]:
from pyspark.ml.feature import StandardScaler

In [19]:
# Scaler over the assembled vectors; both centring (withMean) and
# standard-deviation scaling (withStd) are switched on.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True,
)

In [21]:
# Fit the scaler to the assembled feature vectors; the result is the model that does the scaling.
scaler_model = scaler.fit(final_data)

In [23]:
# Apply the fitted scaler, adding the "scaledFeatures" column that the clustering loop reads.
scaled_final_data = scaler_model.transform(final_data)

In [25]:
# Sweep candidate cluster counts. For each k, report the within-set sum of
# squared errors and how many rows fall into each predicted cluster.
for k in range(2,10):
    # An explicit seed keeps centroid initialisation, and therefore the
    # cluster sizes printed below, repeatable across kernel restarts.
    kmeans = KMeans(featuresCol="scaledFeatures",k=k,seed=1)
    model = kmeans.fit(scaled_final_data)
    try:
        # Spark >= 2.4 exposes the cost on the training data via the summary.
        # NOTE(review): expected to match computeCost on the same data up to
        # convergence tolerance - confirm if exact digits matter.
        wssse = model.summary.trainingCost
    except AttributeError:
        # Older Spark has no trainingCost; computeCost is still available
        # there (it was deprecated in 2.4 and removed in 3.0).
        wssse = model.computeCost(scaled_final_data)
    print("with K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    # Cluster sizes: a quick check of how evenly the rows split for this k.
    model.transform(scaled_final_data).groupBy("prediction").count().show()
    print("--"*30)

with K=2
Within Set Sum of Squared Errors = 601.7707512676687
+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

------------------------------------------------------------
with K=3
Within Set Sum of Squared Errors = 434.7550730848762
+----------+-----+
|prediction|count|
+----------+-----+
|         1|   79|
|         2|   88|
|         0|  167|
+----------+-----+

------------------------------------------------------------
with K=4
Within Set Sum of Squared Errors = 267.13361168878964
+----------+-----+
|prediction|count|
+----------+-----+
|         1|   88|
|         3|   84|
|         2|   79|
|         0|   83|
+----------+-----+

------------------------------------------------------------
with K=5
Within Set Sum of Squared Errors = 253.10137429343868
+----------+-----+
|prediction|count|
+----------+-----+
|         1|   44|
|         3|   84|
|         4|   44|
|         2|   79|
|         0|   83|
+----------+-