In [35]:
# This notebook is my attempt at solving the clustering example in the "Spark and Python for Big data with Pyspark"
# udemy course

import findspark

findspark.init('/home/ubuntu/spark-2.4.4-bin-hadoop2.7')

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, DoubleType, StructField, StructType
import pyspark.sql.functions as sparkf
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from functools import reduce
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def create_session(name: str):
    return SparkSession.builder.appName(name).getOrCreate()

In [10]:
# explicitly define schema
hacker_schema = [StructField("Session_Connection_Time", IntegerType(), nullable=True), 
                StructField("Bytes Transferred", DoubleType(), nullable=True),
                StructField("Kali_Trace_Used", IntegerType(), nullable=True),
                StructField("Servers_Corrupted", DoubleType(), nullable=True),
                StructField("Pages_Corrupted", IntegerType(), nullable=True),
                StructField("Location", StringType(), nullable=True),
                StructField("WPM_Typing_Speed", DoubleType(), nullable=True)]
hacker_struct = StructType(fields=hacker_schema)

In [22]:
# load dataframe from csv
hacker_session = create_session("hacker_example")
hacker_original_df = hacker_session.read.csv("hack_data.csv", inferSchema=True, header=True)
# create another dataframe that excludes the LOCATION column as it was suggested that wasn't useful
hacker_no_location_df = hacker_original_df.drop("Location")

# hacker_original_df.head(1)
# hacker_original_df.printSchema()
# [Row(Session_Connection_Time=None, Bytes_Transferred=None, Kali_Trace_Used=None, Servers_Corrupted=None, Pages_Corrupted=None, Location=None, WPM_Typing_Speed=None)]

In [23]:
# convert categorical location data into numerical data, only need HANDLEINVALID = "keep" if column
# has null values, still don't know why that works though or exactly what that does
indexer = StringIndexer(inputCol="Location", outputCol="Location_Index")
indexed_original = indexer.fit(hacker_original_df).transform(hacker_original_df)

# and then remove the string Location column as cannot add string data into a features vector
indexed_original = indexed_original.drop("Location")




In [24]:
# for simplicity we will just have 2 transformed datasets. (1) with the location (2) without location. 
# I want keep both as I'd like to compare the clustering performance to confirm if location is relevant or not
original_assembler = VectorAssembler(inputCols=indexed_original.columns, outputCol="features")
no_location_assembler = VectorAssembler(inputCols=hacker_no_location_df.columns, outputCol="features")

original_features = original_assembler.transform(indexed_original)
no_location_features = no_location_assembler.transform(hacker_no_location_df)

In [62]:
# Scaling the features vector so that differing scales of each column doesn't impact clustering (especially
# with the converted categorical column)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(original_features)
final_original_data = scaler_model.transform(original_features)


# same thing but with the df that has no location
scaler_model = scaler.fit(no_location_features)
final_no_location_data = scaler_model.transform(no_location_features)
final_original_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, WPM_Typing_Speed=72.37, Location_Index=54.0, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37, 54.0]), scaled_features=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963, 1.059]))]

In [64]:
# helper method that takes in list of tuples (<label for df>, <df>) and the labels for features column 
# that is the label for the column for the scaled features column
def fit_and_evaluate(dataframes, features_col):
    for df_label, df in dataframes:
        models_original = []
        # remember cannot make a 1 center Kmeans cluster model. As clustering meant to seperate into mulitiple groups    
        # make multiple k means models and evaluate them, to keep things quick we'll just loop 1 - 5 centers
        for i in range(2, 8, 1):
            kmeans = KMeans(featuresCol=features_col).setK(i).setSeed(1)
            model = kmeans.fit(df)
            predictions = model.transform(df)
            models_original.append((model, predictions))

        evaluator = ClusteringEvaluator()
        # evaluate the models (there's sum of squared distances metric which you can see its limitations 
        # as with more centers the distance gets smaller as we overfit with too many centers)
        # The Silhouette analysis metric helps us shed light on the optimal number of centers
        # Good example below of what the ClusteringEvaluator is doing and silhouette analysis
        # https://runawayhorse001.github.io/LearningApacheSpark/clustering.html
        # https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
        print(f"evaluation metrics for dataset: {df_label}")
        for model, predictions in models_original:
            print(f"model with {len(model.clusterCenters())} center(s)")
            print(f"\t sum of squared distances of points to their nearest center: {model.computeCost(df):,}")
            print(f"\t Silhouette analysis: {evaluator.evaluate(predictions)}")
    
# 

In [65]:
# put the dataframes into a list to iterate through to keep code DRY. Still don't have a great way to 
# retrieve alias of DF
dataframes_to_analyze = [("no_location", final_no_location_data), ("with_location", final_original_data)]

fit_and_evaluate(dataframes_to_analyze, "scaled_features")


evaluation metrics for dataset: no_location
model with 2 center(s)
	 sum of squared distances of points to their nearest center: 601.7707512676716
	 Silhouette analysis: 0.6683623593283755
model with 3 center(s)
	 sum of squared distances of points to their nearest center: 434.75507308487647
	 Silhouette analysis: 0.30412315937808737
model with 4 center(s)
	 sum of squared distances of points to their nearest center: 267.1336116887891
	 Silhouette analysis: -0.04792891045570489
model with 5 center(s)
	 sum of squared distances of points to their nearest center: 406.0381118469415
	 Silhouette analysis: 0.10059152769091971
model with 6 center(s)
	 sum of squared distances of points to their nearest center: 227.5888199292027
	 Silhouette analysis: -0.10827640583392491
model with 7 center(s)
	 sum of squared distances of points to their nearest center: 207.04933720005226
	 Silhouette analysis: -0.13479542730344232
evaluation metrics for dataset: with_location
model with 2 center(s)
	 sum o