In [1]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings; consider
# narrowing it to the specific categories that are noisy.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialise findspark before any pyspark import.
# NOTE(review): presumably this locates the local Spark installation so the
# pyspark imports in the next cell resolve — confirm against the environment.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Seed Clustering")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/02 13:11:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/02 13:11:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Load the seeds measurements: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'seeds.csv',
    header=True,
    inferSchema=True,
)

# Preview the first five rows as a sanity check on the load.
df.show(n=5)

                                                                                

+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
| area|perimeter|compactness|length of kernel|width of kernel|asymmetry coefficient|length of kernel groove|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
|15.26|    14.84|      0.871|           5.763|          3.312|                2.221|                   5.22|
|14.88|    14.57|     0.8811|           5.554|          3.333|                1.018|                  4.956|
|14.29|    14.09|      0.905|           5.291|          3.337|                2.699|                  4.825|
|13.84|    13.94|     0.8955|           5.324|          3.379|                2.259|                  4.805|
|16.14|    14.99|     0.9034|           5.658|          3.562|                1.355|                  5.175|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
only showing top 5 

In [6]:
# Show the inferred column names and types to confirm every column was read as numeric.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of kernel groove: double (nullable = true)



In [7]:
# Pack every column of the seeds table into a single vector column named
# "features", which the clustering estimators below read as their input.
# NOTE(review): StandardScaler is imported but never applied in the visible
# cells, so clustering runs on unscaled measurements — confirm this is intended.
feature_cols = df.columns
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
df_transformed = assembler.transform(df)

In [8]:
# Baseline model: K-means with k=3 on the assembled feature vectors.
# A fixed seed pins the random centroid initialisation so the silhouette score
# reported for this baseline is reproducible after a kernel restart, and it
# matches the seed used by the tuned model later in the notebook.
kmeans = KMeans(k=3, seed=42, featuresCol="features", predictionCol="prediction")
model = kmeans.fit(df_transformed)

23/12/02 13:11:44 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [9]:
# Apply the fitted model to the assembled data; the result carries the model's
# output in the "prediction" column configured on the estimator.
predictions = model.transform(df_transformed)

#### The Silhouette Score measures the quality of a clustering such as K-means. For each point, it compares how similar the point is to its own cluster (cohesion) with how similar it is to the other clusters (separation). ####

In [10]:
# Score the baseline cluster assignments. ClusteringEvaluator is constructed
# with no arguments, so it uses its default metric and reads its default columns.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette Score = ", silhouette)

Silhouette Score =  0.6632174368466238


#### Finding the best parameters for K-means in PySpark requires hyperparameter tuning: searching over candidate settings with a technique such as grid search or random search and keeping the combination that gives the best evaluation score. The next cell runs a grid search over `k` and `maxIter`, scoring each combination with 5-fold cross-validation.

In [11]:
# Grid-search k and maxIter for K-means, scoring each candidate combination
# with 5-fold cross-validation.
# Both the estimator and the cross-validator are seeded: K-means initialises
# its centroids randomly and CrossValidator splits the folds randomly, so
# without seeds the selected "best" parameters can change from run to run.
kmeans = KMeans(seed=42, featuresCol="features", predictionCol="prediction")

# 4 values of k x 4 values of maxIter = 16 candidate parameter maps.
param_grid = (
    ParamGridBuilder()
    .addGrid(kmeans.k, [2, 3, 4, 5])
    .addGrid(kmeans.maxIter, [10, 20, 30, 40])
    .build()
)
evaluator = ClusteringEvaluator()
crossval = CrossValidator(estimator=kmeans,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

cv_model = crossval.fit(df_transformed)

# Model fitted with the best-scoring parameter combination from the grid.
best_kmeans_model = cv_model.bestModel

print("Best Parameters:")
print(f"K: {best_kmeans_model.getK()}")
print(f"Max Iterations: {best_kmeans_model.getMaxIter()}")

Best Parameters:
K: 2
Max Iterations: 10


In [12]:
# Rebuild the estimator from the parameters selected by the cross-validated
# search instead of copying them by hand from the printed output, so this cell
# stays in sync if the search is re-run and picks different values.
kmeans = KMeans(k=best_kmeans_model.getK(),
                maxIter=best_kmeans_model.getMaxIter(),
                seed=42,
                featuresCol="features",
                predictionCol="prediction")

In [13]:
# Fit the tuned estimator on the assembled features; this rebinds `model`,
# replacing the baseline model fitted earlier.
model = kmeans.fit(df_transformed)

In [14]:
# Assign each row to a cluster with the tuned model; this rebinds `predictions`,
# replacing the baseline assignments.
predictions = model.transform(df_transformed)

In [15]:
# Preview the first five rows with their cluster assignment, without truncating
# long cell values such as the feature vector.
predictions.show(n=5, truncate=False)

+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+--------------------------------------------+----------+
|area |perimeter|compactness|length of kernel|width of kernel|asymmetry coefficient|length of kernel groove|features                                    |prediction|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+--------------------------------------------+----------+
|15.26|14.84    |0.871      |5.763           |3.312          |2.221                |5.22                   |[15.26,14.84,0.871,5.763,3.312,2.221,5.22]  |1         |
|14.88|14.57    |0.8811     |5.554           |3.333          |1.018                |4.956                  |[14.88,14.57,0.8811,5.554,3.333,1.018,4.956]|1         |
|14.29|14.09    |0.905      |5.291           |3.337          |2.699                |4.825                  |[14.29,14.09,0.905,5.291,3.337,2.699,4.825] |1         |
|13.84|13.

In [16]:
# Count how many rows were assigned to each cluster.
(
    predictions
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  134|
|         0|   76|
+----------+-----+



In [17]:
# Score the tuned model's cluster assignments with a default ClusteringEvaluator
# so the result can be compared against the baseline score computed earlier.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette Score = ", silhouette)

Silhouette Score =  0.7161978322145462


In [18]:
# Shut down the Spark session now that the analysis is complete.
spark.stop()

#### Interpretation of Silhouette Score:

##### Close to +1: Indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

##### Close to 0: Indicates that the object is on or very close to the decision boundary between two neighboring clusters.

##### Close to -1: Indicates that the object may be assigned to the wrong cluster.

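#### Conclusion: In the run recorded above, the hyperparameter search raised the Silhouette Score from about 0.66 (baseline, k=3) to about 0.72 (tuned, k=2). A score closer to +1 indicates more cohesive, better-separated clusters, so by this metric the tuned model performs better.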