In [2]:
# Always needs to be done on the Raspberry Pi: register the local Spark
# install with findspark before pyspark is imported.
import findspark
findspark.init('/home/baxman/spark-2.4.7-bin-hadoop2.7')
from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise create one named 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [4]:
# Load the seeds CSV; the first row is treated as the header and column
# types are inferred from the data.
data = spark.read.csv(
    '/home/baxman/Codes/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the column names and types Spark inferred for the CSV.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
# Need to scale the unlabeled data, then cluster with K = 3

In [7]:
# Import KMeans from MLlib
from pyspark.ml.clustering import KMeans

In [8]:
# Format data
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names; all of them are used as clustering inputs below.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [11]:
# Pack every column of `data` into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

In [13]:
# Append the assembled 'features' vector column to the raw data.
final_data = assembler.transform(data)

In [14]:
# Preview the first rows, including the new 'features' column.
final_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355| 

In [16]:
# Scale data
from pyspark.ml.feature import StandardScaler

In [23]:
# Scale each feature to unit standard deviation without centering. These are
# StandardScaler's defaults, spelled out here so the choice is visible.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withStd=True,
    withMean=False,
)

In [28]:
# Fit the scaler on the assembled 'features' column.
scaler_model = scaler.fit(final_data)

In [29]:
# Add the 'scaled_features' column. Any existing 'scaled_features' column is
# dropped first so this cell can be re-run: the transform raises if its output
# column is already present, and drop() is a no-op when the column is absent.
final_data = scaler_model.transform(final_data.drop('scaled_features'))

In [30]:
# Preview the first rows, now including the 'scaled_features' column.
final_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|     scaled_features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [31]:
# Instantiate the model on the scaled features with K = 3 and a fixed seed.
# Increasing K gives more clusters, and WSSSE goes down.
kmeans = KMeans(featuresCol='scaled_features', k=3, seed=1)

In [32]:
# Fit K-means to the scaled feature vectors.
model = kmeans.fit(final_data)

In [34]:
# Within-set sum of squared errors, evaluated on the data the model was fit on.
# NOTE(review): computeCost is deprecated as of Spark 2.4 and removed in 3.0 —
# confirm the replacement (e.g. model.summary.trainingCost) before upgrading.
print('WSSSE is:', model.computeCost(final_data), sep='\n')

WSSSE is:
428.6333432285446


In [35]:
# Get centers
# One array per cluster; coordinates are in the scaled feature space because
# the model was fit on the 'scaled_features' column.
centers = model.clusterCenters()
print(centers)

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.07135818, 10.14438097, 35.86461803, 11.81349589,  7.53471695,
        3.18317127, 10.39230304]), array([ 4.94114963, 10.95557919, 37.3028184 , 12.42383591,  8.60815545,
        1.80983376, 10.40657797])]


In [37]:
# Assign every row to a cluster and keep only the predicted cluster index.
results = (
    model
    .transform(final_data)
    .select('prediction')
)
results.show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

