In [None]:
# Initialise findspark before the pyspark imports that follow
# (presumably needed so a local Spark install is importable -- confirm for
# your environment).
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by every cell below; the
# application is registered under the name 'hack_find'.
spark = (
    SparkSession.builder
    .appName('hack_find')
    .getOrCreate()
)

In [None]:
from pyspark.ml.clustering import KMeans

# Read the hack-session CSV into a DataFrame. The first row is used as the
# column names and the column types are inferred from the values.
data = spark.read.csv(
    'data/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Peek at the first row to see the column names and the kind of values held.
data.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [None]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [None]:
# List the column names so the feature columns can be picked out below.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# Columns assembled into the feature vector for clustering. The string
# column 'Location' is left out of this list.
feat_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',  # note: this column name contains a space
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [None]:
# Combine the selected feature columns into one vector column named
# 'features'.
vec_assembler = VectorAssembler(
    inputCols=feat_cols,
    outputCol='features',
)

In [None]:
# Apply the assembler: final_data is the loaded data plus the 'features'
# vector column.
final_data = vec_assembler.transform(data)

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Scaler that reads 'features' and writes 'scaledFeatures'. Features are
# scaled by their standard deviation (withStd) but not centred on the mean
# (withMean is off).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [None]:
# Fit the StandardScaler on the assembled data; fitting computes the summary
# statistics the scaler needs and returns the fitted model used to transform
# the data in the next step.
scalerModel = scaler.fit(final_data)

In [None]:
# Normalize each feature to have unit standard deviation. The scaled vectors
# are written to the 'scaledFeatures' column, which the clustering below
# uses as its input.
cluster_final_data = scalerModel.transform(final_data)

**Time to find out whether it is 2 or 3!**

In [None]:
# One K-means estimator per candidate cluster count (3 and 2), both
# clustering on the scaled feature vectors.
# NOTE(review): no seed is passed, so the initialisation may differ between
# sessions -- consider fixing a seed if reproducible output matters.
kmeans3 = KMeans(k=3, featuresCol='scaledFeatures')
kmeans2 = KMeans(k=2, featuresCol='scaledFeatures')

In [None]:
# Fit both candidate models on the same scaled data so their results can be
# compared directly.
model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

In [None]:
# Within Set Sum of Squared Errors (WSSSE) for each fitted model.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
# Spark 3.0. Keep using it where it still exists so older installations get
# the same numbers as before; otherwise read the cost recorded in the
# model's training summary (both models were fitted on cluster_final_data,
# so that cost refers to the same data).
if hasattr(model_k3, 'computeCost'):
    wssse_k3 = model_k3.computeCost(cluster_final_data)
    wssse_k2 = model_k2.computeCost(cluster_final_data)
else:
    wssse_k3 = model_k3.summary.trainingCost
    wssse_k2 = model_k2.summary.trainingCost

In [None]:
# Report the WSSSE of the K = 3 and K = 2 models, separated by a
# 60-character dashed rule.
print('With K = 3')
print('Within Set Sum of Squared Errors = ' + str(wssse_k3))
print('--'*30)
print('With K = 2')
print('Within Set Sum of Squared Errors = ' + str(wssse_k2))

With K = 3
Within Set Sum of Squared Errors = 434.75507308487647
------------------------------------------------------------
With K = 2
Within Set Sum of Squared Errors = 601.7707512676716


Not much is to be gained from the WSSSE alone; after all, we would expect the WSSSE to decrease as K increases. We could, however, continue the analysis by looking at the drop from K = 3 to K = 4, to check whether the clustering favors even or odd numbers. This won't be substantial, but it's worth a look:

In [None]:
# Fit a K-means model for every K from 2 to 8 and print its Within Set Sum
# of Squared Errors, to see how the error drops as K grows.
for k in range(2, 9):
    kmeans = KMeans(featuresCol = 'scaledFeatures', k = k)
    model = kmeans.fit(cluster_final_data)
    # KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
    # Spark 3.0. Use it where it still exists (unchanged numbers on older
    # installations); otherwise fall back to the cost stored in the
    # training summary of the model that was just fitted.
    if hasattr(model, 'computeCost'):
        wssse = model.computeCost(cluster_final_data)
    else:
        wssse = model.summary.trainingCost
    print('With K = {}'.format(k))
    print('Within Set Sum of Squared Errors = ' + str(wssse))
    print('--'*30)

With K = 2
Within Set Sum of Squared Errors = 601.7707512676716
------------------------------------------------------------
With K = 3
Within Set Sum of Squared Errors = 434.75507308487647
------------------------------------------------------------
With K = 4
Within Set Sum of Squared Errors = 267.1336116887891
------------------------------------------------------------
With K = 5
Within Set Sum of Squared Errors = 247.80143915458297
------------------------------------------------------------
With K = 6
Within Set Sum of Squared Errors = 232.6312088685911
------------------------------------------------------------
With K = 7
Within Set Sum of Squared Errors = 205.55236311802878
------------------------------------------------------------
With K = 8
Within Set Sum of Squared Errors = 189.49216607656717
------------------------------------------------------------


**Nothing definitive can be said with the above, but wait! The last key fact that the engineer mentioned was that the attacks should be evenly numbered between the hackers! Let's check with the transform and prediction columns that result from this! Congratulations if you made this connection; it was quite tricky given what we've covered.**

In [None]:
# Assign every row to a cluster with the K = 3 model and show how many rows
# fall into each predicted cluster.
(
    model_k3.transform(cluster_final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   88|
|         0|   79|
+----------+-----+



In [None]:
# Assign every row to a cluster with the K = 2 model and show how many rows
# fall into each predicted cluster.
(
    model_k2.transform(cluster_final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



## <strong>It was 2 hackers! In fact, our clustering algorithm created two equally sized clusters with K = 2; no way that is a coincidence!</strong>