In [1]:
import findspark

# findspark.init() has to run before the first pyspark import: it locates the
# Spark installation and puts its Python bindings on sys.path. Importing
# pyspark first would fail on a fresh kernel where pyspark is not already
# importable.
findspark.init()

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [2]:
# Let findspark locate the local Spark installation for this kernel.
# NOTE(review): the pyspark imports in the first cell execute before this
# call; findspark's documented usage is to call init() before importing
# pyspark -- confirm the ordering is intended.
findspark.init()

In [3]:
# Start (or reuse) the Spark session for the customer clustering run.
spark = (
    SparkSession.builder
    .appName("Clustering using SparkML")
    .getOrCreate()
)

In [5]:
# Read the customer spending CSV; the first row is the header and column
# types are inferred from the data.
customer_data = spark.read.csv(
    'customers.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first five rows with full (untruncated) cell contents.
customer_data.show(5, truncate=False)

+----------+----+-------+-----------+
|Fresh_Food|Milk|Grocery|Frozen_Food|
+----------+----+-------+-----------+
|12669     |9656|7561   |214        |
|7057      |9810|9568   |1762       |
|6353      |8808|7684   |2405       |
|13265     |1196|4221   |6404       |
|22615     |5410|7198   |3915       |
+----------+----+-------+-----------+
only showing top 5 rows



In [7]:
# Check the inferred column types before assembling features.
customer_data.printSchema()

root
 |-- Fresh_Food: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Grocery: integer (nullable = true)
 |-- Frozen_Food: integer (nullable = true)



In [8]:
# Columns that make up the clustering feature vector.
feature_cols = [
    'Fresh_Food',
    'Milk',
    'Grocery',
    'Frozen_Food',
]

# Combine the feature columns into a single vector column named 'features',
# keeping the original columns alongside it.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)
customer_transformed_data = assembler.transform(customer_data)

In [9]:
# k for the customer K-Means model fitted in the next cell.
number_of_clusters = 3

In [10]:
# Fit K-Means on the assembled customer features.
# The seed is pinned so centroid initialisation -- and therefore the cluster
# ids and assignments shown below -- is repeatable across kernel restarts.
kmeans = KMeans(k=number_of_clusters, seed=42)
model = kmeans.fit(customer_transformed_data)

In [11]:
# Score the same rows the model was fitted on; the result carries the input
# columns plus a `prediction` column holding the assigned cluster id.
predictions = model.transform(customer_transformed_data)

In [12]:
# Inspect the cluster assignments (default preview, long cells truncated).
predictions.show()

+----------+-----+-------+-----------+--------------------+----------+
|Fresh_Food| Milk|Grocery|Frozen_Food|            features|prediction|
+----------+-----+-------+-----------+--------------------+----------+
|     12669| 9656|   7561|        214|[12669.0,9656.0,7...|         1|
|      7057| 9810|   9568|       1762|[7057.0,9810.0,95...|         1|
|      6353| 8808|   7684|       2405|[6353.0,8808.0,76...|         1|
|     13265| 1196|   4221|       6404|[13265.0,1196.0,4...|         1|
|     22615| 5410|   7198|       3915|[22615.0,5410.0,7...|         0|
|      9413| 8259|   5126|        666|[9413.0,8259.0,51...|         1|
|     12126| 3199|   6975|        480|[12126.0,3199.0,6...|         1|
|      7579| 4956|   9426|       1669|[7579.0,4956.0,94...|         1|
|      5963| 3648|   6192|        425|[5963.0,3648.0,61...|         1|
|      6006|11093|  18881|       1159|[6006.0,11093.0,1...|         2|
|      3366| 5403|  12974|       4400|[3366.0,5403.0,12...|         1|
|     

In [13]:
# Shut down the customer-clustering Spark session before the next analysis.
spark.stop()

In [28]:
# Start a separate Spark session for the seed clustering run.
spark1 = (
    SparkSession.builder
    .appName("Seed Clustering")
    .getOrCreate()
)

In [29]:
# Read the seed measurements CSV; the first row is the header and column
# types are inferred from the data.
seed_data = spark1.read.csv(
    "seeds.csv",
    header=True,
    inferSchema=True,
)

In [30]:
# Check the inferred column types before assembling features.
seed_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of kernel groove: double (nullable = true)



In [31]:
# Preview the first five rows with full (untruncated) cell contents.
seed_data.show(5, truncate=False)

+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
|area |perimeter|compactness|length of kernel|width of kernel|asymmetry coefficient|length of kernel groove|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
|15.26|14.84    |0.871      |5.763           |3.312          |2.221                |5.22                   |
|14.88|14.57    |0.8811     |5.554           |3.333          |1.018                |4.956                  |
|14.29|14.09    |0.905      |5.291           |3.337          |2.699                |4.825                  |
|13.84|13.94    |0.8955     |5.324           |3.379          |2.259                |4.805                  |
|16.14|14.99    |0.9034     |5.658           |3.562          |1.355                |5.175                  |
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
only showing top 5 

In [35]:
# Columns that make up the seed feature vector.
feature_cols = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]

# Combine the feature columns into a single vector column named 'features',
# keeping the original columns alongside it.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)
seed_transformed_data = assembler.transform(seed_data)

In [36]:
# NOTE(review): k=7 equals the number of feature columns assembled for the
# seed data -- confirm that seven clusters (rather than seven features) is
# what is intended here.
number_of_clusters = 7

# Fit K-Means on the assembled seed features.
# The seed is pinned so centroid initialisation -- and therefore the cluster
# ids and the per-cluster counts shown below -- is repeatable across kernel
# restarts.
kmeans = KMeans(k=number_of_clusters, seed=42)
model = kmeans.fit(seed_transformed_data)

In [39]:
# Score the same rows the model was fitted on; the result carries the input
# columns plus a `prediction` column holding the assigned cluster id.
predictions = model.transform(seed_transformed_data)

In [40]:
# Show five scored rows, one field per line, without truncating values.
predictions.show(5, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------
 area                    | 15.26                                        
 perimeter               | 14.84                                        
 compactness             | 0.871                                        
 length of kernel        | 5.763                                        
 width of kernel         | 3.312                                        
 asymmetry coefficient   | 2.221                                        
 length of kernel groove | 5.22                                         
 features                | [15.26,14.84,0.871,5.763,3.312,2.221,5.22]   
 prediction              | 2                                            
-RECORD 1---------------------------------------------------------------
 area                    | 14.88                                        
 perimeter               | 14.57                                        
 compactness             | 0.8811                  

In [41]:
# Cluster sizes: how many seed rows were assigned to each cluster id.
cluster_sizes = predictions.groupBy('prediction').count()
cluster_sizes.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   23|
|         6|   19|
|         3|   44|
|         5|   44|
|         4|   32|
|         2|   24|
|         0|   24|
+----------+-----+



In [42]:
# Shut down the seed-clustering Spark session now that the analysis is done.
spark1.stop()