In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('seedfinder')
    .getOrCreate()
)

### Explore Data

In [3]:
# Read the seeds CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    path='../data/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Confirm the inferred column names and types before building features.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
# Peek at the first record; head(1) returns a one-element list of Row objects.
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [7]:
# Number of rows (seed samples) in the dataset.
data.count()

210

In [8]:
# we know there are 3 different kinds of wheat, so we use K = 3

In [9]:
from pyspark.ml.clustering import KMeans

### Create Feature Set

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# The dataset has no label column, so every column is packed into the
# single 'features' vector that the downstream stages consume.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=data.columns,
)

In [15]:
# Build the feature vector and keep only that column; the raw measurement
# columns are not needed past this point.
with_features = (
    assembler
    .transform(data)
    .select('features')
)

### Feature Scaling

In [17]:
from pyspark.ml.feature import StandardScaler

In [18]:
# Scaler that reads the assembled 'features' vector and writes 'scaled_features'.
scaler = StandardScaler(
    outputCol='scaled_features',
    inputCol='features',
)

In [20]:
# Fit the scaler on the assembled feature vectors.
scalar_model = scaler.fit(dataset=with_features)

In [21]:
# Apply the fitted scaler; this adds the 'scaled_features' column next to 'features'.
scaled_data = scalar_model.transform(dataset=with_features)

In [23]:
# Spot-check the first scaled feature vector.
(
    scaled_data
    .select('scaled_features')
    .head(1)
)

[Row(scaled_features=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

### Train KMeans Model

In [25]:
# Cluster on the scaled features. k=3 because the dataset is known to
# contain three kinds of wheat.
# A fixed seed makes centroid initialisation -- and therefore the cluster ids
# and centers reported below -- reproducible across kernel restarts. Without
# it, results can change from run to run.
kmeans = KMeans(featuresCol='scaled_features', k=3, seed=42)
model = kmeans.fit(scaled_data)

### Interpret Cluster Results

In [26]:
# Within-set sum of squared errors (WSSSE) on the training data.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in Spark 3.0,
# so use it only where it still exists and otherwise read the cost from the
# training summary. The summary value is the cost recorded during fitting and
# may differ marginally from a fresh computeCost evaluation.
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(scaled_data)
else:
    wssse = model.summary.trainingCost
wssse

428.60820118716356

In [27]:
# Centroids of the three clusters. The model was fit on 'scaled_features',
# so these coordinates are in the scaled space, not the original units.
model.clusterCenters()

[array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80061978, 10.41913733]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15410901, 10.38031464])]

In [28]:
# Assign every sample to a cluster; the result gains a 'prediction' column.
results = model.transform(dataset=scaled_data)

In [31]:
# Display the assigned cluster id for the first 20 samples.
results.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

