In [1]:
import findspark
# NOTE(review): init() runs before the pyspark import below, presumably so that
# findspark can make the local Spark installation importable; confirm for this environment.
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('hackerclustering')
    .master('local[4]')
    .getOrCreate()
)

In [3]:
# Load the raw session log; the first row holds column names and types are inferred.
data = spark.read.csv(
    '../data/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# is the third suspect involved in the attack?
# can we use clustering to try to identify this?
# forensic engineer knows hackers trade off attacks,
# i.e. they should have the same number of hacks each

In [6]:
# Inspect the column names and the types produced by inferSchema.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [11]:
# we have one categorical attribute, Location
# we can drop it because the hackers use VPNs, so location is not reliable

In [12]:
# Remove the only string column; every remaining column is numeric.
# Note: this rebinds `data`, so later cells see the frame without 'Location'.
data = data.drop('Location')

In [9]:
# everything is numeric, no need to encode
# we should perform feature scaling so no single feature dominates distance metric

### Create Feature Set

In [7]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Pack every remaining column into a single vector column named 'features',
# and keep only that column for the downstream stages.
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)
with_features = (
    assembler
    .transform(data)
    .select('features')
)

### Scale Data

In [15]:
from pyspark.ml.feature import StandardScaler

In [19]:
# Fit the scaler on the assembled features, then apply it to the same frame.
# NOTE: the output column name 'scaled_featurs' (sic) is what the KMeans cells
# below read from, so it must stay in sync with them.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_featurs',
)
scaled_data = (
    scaler
    .fit(with_features)
    .transform(with_features)
)

### Train KMeans Model

In [21]:
from pyspark.ml.clustering import KMeans

In [23]:
# first we will check clustering of 2 hackers
# KMeans initialisation is random; pin the seed so that Restart & Run All
# reproduces the same clusters (exact numbers may differ from outputs recorded
# with a different seed).
kmeans = KMeans(featuresCol='scaled_featurs', k=2, seed=42)
model_k2 = kmeans.fit(scaled_data)

In [24]:
# Within-cluster sum of squared distances for k=2.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0; when it
# is unavailable, use the training summary, which reports the same cost for the
# data the model was fit on (scaled_data).
(model_k2.computeCost(scaled_data)
 if hasattr(model_k2, 'computeCost')
 else model_k2.summary.trainingCost)

601.7707512676716

In [25]:
# Apply the k=2 model to the scaled data; the following cell groups the result
# by its 'prediction' column.
results = model_k2.transform(scaled_data)

In [28]:
# Number of rows assigned to each of the two clusters.
(
    results
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [30]:
# now we will try for 3 clusters
# KMeans initialisation is random; pin the seed so that Restart & Run All
# reproduces the same clusters (exact numbers may differ from outputs recorded
# with a different seed).
kmeans = KMeans(featuresCol='scaled_featurs', k=3, seed=42)
model_k3 = kmeans.fit(scaled_data)

# Within-cluster sum of squared distances for k=3.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0; when it
# is unavailable, use the training summary, which reports the same cost for the
# data the model was fit on (scaled_data).
(model_k3.computeCost(scaled_data)
 if hasattr(model_k3, 'computeCost')
 else model_k3.summary.trainingCost)

434.1492898715845

In [31]:
# Apply the k=3 model to the scaled data, then tally the rows per cluster.
results = model_k3.transform(scaled_data)
(
    results
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   84|
|         2|   83|
|         0|  167|
+----------+-----+



In [32]:
# the forensics team told us the hackers should have an equal number of attacks:
# k=2 splits the sessions evenly (167/167), while k=3 does not (167/84/83),
# so we can conclude that there are only 2 hackers