In [1]:
# Required on the Raspberry Pi: point findspark at the local Spark install
# before pyspark is imported, then open (or reuse) the session for this notebook.
import findspark

findspark.init('/home/baxman/spark-2.4.7-bin-hadoop2.7')

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('hacker')
    .getOrCreate()
)

In [2]:
# Load the CSV into a DataFrame: the first row supplies the column names
# and Spark infers each column's type from the values.
data = spark.read.csv(
    '/home/baxman/Codes/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print the column names and inferred types to check how the CSV was parsed.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [4]:
# Preview the first rows of the raw data (long string values are truncated in the display).
data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [5]:
# Import the KMeans clustering estimator from Spark ML
from pyspark.ml.clustering import KMeans

In [6]:
# Format data
from pyspark.ml.feature import VectorAssembler

In [7]:
# List the column names so the feature columns can be chosen for the VectorAssembler.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [8]:
# Location is left out of the features: it is unreliable because the attackers probably used a VPN.

In [9]:
# Pack every numeric column (everything except Location) into a single
# vector column named 'features', the input format Spark ML estimators expect.
assembler = VectorAssembler(
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
    outputCol='features',
)

In [10]:
# Append the assembled 'features' vector column; the original columns are kept.
final_data = assembler.transform(data)

In [11]:
# Inspect the assembled data, including the new 'features' vector column.
final_data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|[31.0,356.32,1.0,...|
|                    2.0|           228.08|              1|             2.48|     

In [12]:
# Scale data
# K-means is distance based, so features on very different scales
# (e.g. bytes transferred vs. servers corrupted) must be rescaled first.
# StandardScaler's defaults scale each feature to unit standard deviation
# without centering it on the mean.
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol = 'features',outputCol = 'scaled_features')
# This cell reassigns final_data, so on a re-run the frame already carries a
# 'scaled_features' column and fit()/transform() would fail with
# "Output column scaled_features already exists". Dropping it first keeps the
# cell re-runnable; drop() is a no-op when the column is absent (first run).
final_data = final_data.drop('scaled_features')
scaler_model = scaler.fit(final_data)
final_data = scaler_model.transform(final_data)
final_data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|     scaled_features|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58

In [17]:
# Instantiate and fit the models. Increasing K (more clusters) drives WSSSE
# down, so a lower cost alone does not mean a better K.
# K-means starts from randomly chosen centers. Without an explicit seed,
# PySpark derives the default from a Python string hash, which can change
# between kernel restarts, so pin it to make the clustering reproducible.
# NOTE: outputs saved from unseeded runs may differ slightly from a re-run.
KMEANS_SEED = 42
kmeans2 = KMeans(featuresCol = 'scaled_features',k=2,seed=KMEANS_SEED)
kmeans3 = KMeans(featuresCol = 'scaled_features',k=3,seed=KMEANS_SEED)
model_k2 = kmeans2.fit(final_data)
model_k3 = kmeans3.fit(final_data)

In [18]:
# Within-set sum of squared errors for the K=2 model.
# KMeansModel.computeCost() is deprecated since Spark 2.4 and removed in 3.0;
# the training summary reports the K-means cost recorded while fitting on the
# training data (final_data here). It can differ marginally from a fresh
# computeCost(final_data) evaluation.
print('K2 WSSSE is:')
print(model_k2.summary.trainingCost)

K2 WSSSE is:
601.7707512676716


In [19]:
# Within-set sum of squared errors for the K=3 model.
# KMeansModel.computeCost() is deprecated since Spark 2.4 and removed in 3.0;
# the training summary reports the K-means cost recorded while fitting on the
# training data (final_data here). It can differ marginally from a fresh
# computeCost(final_data) evaluation.
print('K3 WSSSE is:')
print(model_k3.summary.trainingCost)

K3 WSSSE is:
434.75507308487647


In [22]:
# Get centers
# One center per cluster for the K=2 model. Coordinates are in the scaled
# feature space the model was fitted on ('scaled_features'), in the same
# order as the assembler's inputCols.
centers_k2 = model_k2.clusterCenters()
print(centers_k2)

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
       5.26676612]), array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
       3.28474   ])]


In [23]:
# Get centers
# One center per cluster for the K=3 model. Coordinates are in the scaled
# feature space the model was fitted on ('scaled_features'), in the same
# order as the assembler's inputCols.
centers_k3 = model_k3.clusterCenters()
print(centers_k3)

[array([3.05623261, 2.95754486, 1.99757683, 3.2079628 , 4.49941976,
       3.26738378]), array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
       5.26676612]), array([2.93719177, 2.88492202, 0.        , 3.19938371, 4.52857793,
       3.30407351])]


In [28]:
# Label every row with its K=2 cluster (the 'prediction' column),
# then count how many rows fall into each cluster.
results_k2 = model_k2.transform(final_data)
(
    results_k2
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [29]:
# Label every row with its K=3 cluster (the 'prediction' column),
# then count how many rows fall into each cluster.
results_k3 = model_k3.transform(final_data)
(
    results_k3
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   79|
|         0|   88|
+----------+-----+



In [None]:
# Conclusion: with K = 3 the clusters are uneven (167 / 79 / 88), whereas K = 2 splits the rows evenly (167 / 167), so K = 2 is the better choice.