In [52]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.ml.clustering import KMeans

## Set seed
## Single shared random seed; it is passed as the `seed` argument of each KMeans estimator in the modeling cell.
seed = 42

In [53]:
## Spark session: reuse a running session if one exists, otherwise start one
## under the application name 'kmConsProject'.
session_builder = SparkSession.builder.appName('kmConsProject')
spark = session_builder.getOrCreate()

In [54]:
## Explicit schema for the CSV: (column name, Spark type) pairs in file order.
## Every column is declared nullable.
column_specs = [
    ('session_connection_time', DoubleType()),
    ('bytes_transferred', DoubleType()),
    ('kali_trace_used', IntegerType()),
    ('servers_corrupted', DoubleType()),
    ('pages_corrupted', DoubleType()),
    ('location', StringType()),
    ('wpm_typing_speed', DoubleType()),
]
schema = StructType(fields=[StructField(col_name, col_type, True)
                            for col_name, col_type in column_specs])

In [55]:
## Read the CSV with the hand-written schema (schema inference is switched off),
## treating the first line as a header row.
data_path = 'gs://spark-training-data/datasets/hack_data.csv'
df = spark.read.csv(data_path, schema=schema, header=True, inferSchema=False)

## Preview the first rows, then print the schema to confirm the declared types were applied.
df.show(5)
df.printSchema()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|session_connection_time|bytes_transferred|kali_trace_used|servers_corrupted|pages_corrupted|            location|wpm_typing_speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [56]:
## Encode the string column 'location' as a numeric 'location_index' column.
## NOTE: 'location_index' is not listed among the VectorAssembler input columns
## in this notebook, so it does not enter the clustering features.
indexer = StringIndexer(inputCol='location', outputCol='location_index')
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)
df_indexed.show(5)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------+
|session_connection_time|bytes_transferred|kali_trace_used|servers_corrupted|pages_corrupted|            location|wpm_typing_speed|location_index|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|          88.0|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|          47.0|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|          92.0|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|    

In [64]:
## Combine the six numeric columns into a single 'features' vector column.
feature_cols = [
    'session_connection_time',
    'bytes_transferred',
    'kali_trace_used',
    'servers_corrupted',
    'pages_corrupted',
    'wpm_typing_speed',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
output_features = assembler.transform(df_indexed)

## Inspect the first assembled row.
output_features.head(1)

[Row(session_connection_time=8.0, bytes_transferred=391.09, kali_trace_used=1, servers_corrupted=2.96, pages_corrupted=7.0, location='Slovenia', wpm_typing_speed=72.37, location_index=88.0, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))]

In [65]:
## Scale the assembled 'features' vector into 'scaled_features'.
## No withMean/withStd arguments are passed, so the scaler's default settings apply.
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
scaler_model = scaler.fit(output_features)
output_features_scaled = scaler_model.transform(output_features)

In [66]:
## Modeling frame: keep only the scaled feature vector column.
final_data = output_features_scaled.select('scaled_features')
final_data.show(5)

+--------------------+
|     scaled_features|
+--------------------+
|[0.56785108466505...|
|[1.41962771166263...|
|[2.20042295307707...|
|[0.14196277116626...|
|[1.41962771166263...|
+--------------------+
only showing top 5 rows



In [75]:
## Two candidate clusterings on the scaled features: k=2 and k=3.
## Both estimators share the same seed so the runs are comparable.
kmeans2 = KMeans(k=2, seed=seed, featuresCol='scaled_features')
kmeans3 = KMeans(k=3, seed=seed, featuresCol='scaled_features')

## Fit each estimator on the same modeling frame (k=2 first, then k=3).
kmeans_model_2 = kmeans2.fit(final_data)
kmeans_model_3 = kmeans3.fit(final_data)

In [76]:
## Training cost reported by each fitted model's summary, k=2 then k=3.
wssse_2 = kmeans_model_2.summary.trainingCost
wssse_3 = kmeans_model_3.summary.trainingCost

print(wssse_2)
print(wssse_3)

601.7707512676691
434.75507308487596


In [80]:
## Determine # of Hackers
## Assign every row to a cluster under each model and count rows per cluster.
## groupBy().count() returns rows in no guaranteed order, so sort by the
## 'prediction' label to keep the displayed tables stable across runs.
cluster_sizes_k2 = kmeans_model_2.transform(final_data).groupBy('prediction').count()
cluster_sizes_k2.orderBy('prediction').show()

cluster_sizes_k3 = kmeans_model_3.transform(final_data).groupBy('prediction').count()
cluster_sizes_k3.orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   88|
|         2|   79|
|         0|  167|
+----------+-----+

