#### Dataset: data3D.csv
#### Requirement:
* Read the dataset
* Pre-process the data
* Use the K-means clustering algorithm to cluster the 3D points in data3D.csv

In [None]:
# Inject CSS that sets `white-space: pre` on <pre> elements, so text output
# (e.g. wide DataFrame.show() tables) is not wrapped.
# Both names come from the public IPython.display module rather than the
# internal IPython.core.display path, and `display` is imported explicitly.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
# findspark.init() is called before any pyspark import in the cells below;
# presumably it makes the local Spark installation importable — confirm
# against the findspark documentation.
import findspark
findspark.init()

In [None]:
import pyspark
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# The module is `mplot3d` (the original `mplot3s` raised ImportError).
# Axes3D is imported for the 3D scatter plot further down.
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler

from pyspark.ml.clustering import KMeans

In [None]:
# Get (or create) the Spark session for this notebook's application name.
spark = (
    SparkSession.builder
    .appName('kmeans_3D_point')
    .getOrCreate()
)

In [None]:
# Load the CSV, treating the first line as a header and asking Spark to
# infer the column types.
data = spark.read.csv(
    path="../../Data/data3D.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Keep only the three coordinate columns.
data = data.select('x', 'y', 'z')

In [None]:
# Preview the first three rows of the selected columns.
data.show(3)

### Format the data

In [None]:
# Assemble every column of `data` into a single vector column named 'features'.
vec_assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

In [None]:
# Apply the assembler to `data`; the result carries the 'features' column
# that the scaler below reads.
final_data = vec_assembler.transform(data)

### Scale the Data

In [None]:
# Scale 'features' into 'scaledFeatures' using the standard deviation only
# (withStd=True) and without centering on the mean (withMean=False).
# The class is StandardScaler, as imported above; the original spelling
# `StanderdScaler` raised NameError.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [None]:
# Compute summary statistics by fitting the StandardScaler on final_data;
# the fitted model is applied in the next cell.
scalerModel = scaler.fit(final_data)

In [None]:
# Normalize each feature to have unit standard deviation.
# NOTE(review): this rebinds final_data to the transformed frame, so re-running
# only this cell feeds an already-scaled frame back into the scaler model —
# presumably that fails because 'scaledFeatures' already exists; confirm, or
# restart the kernel and run all cells in order.
final_data = scalerModel.transform(final_data)

In [None]:
# Preview three rows without truncating the vector columns.
final_data.show(n=3, truncate=False)

### Train the Model and Evaluate

In [None]:
# Elbow search: fit K-means for every k in 2..10 and record the WSSSE
# (Within Set Sum of Squared Errors) so a suitable k can be read off the plot.
k_list = []
wssse_list = []
for k in range(2, 11):
    # A fixed seed keeps the curve reproducible between runs.
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=42)
    model = kmeans.fit(final_data)
    # KMeansModel.computeCost() was removed in Spark 3.0. The training summary
    # exposes the cost on the data the model was fitted on, which is the same
    # frame the original call evaluated.
    wssse = model.summary.trainingCost
    k_list.append(k)
    wssse_list.append(wssse)
    print('With k=', k, "Set Sum of Squared Errors=", str(wssse))

In [None]:
# Elbow plot: WSSSE against the number of clusters. Axis labels and a title
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(k_list, wssse_list)
ax.set_xlabel('k (number of clusters)')
ax.set_ylabel('WSSSE')
ax.set_title('K-means WSSSE by k')
plt.show()

Choose k=5

In [22]:
# Train the final k-means model with k=5.
# A fixed seed makes the centers and cluster labels reproducible across runs.
kmeans = KMeans(featuresCol='scaledFeatures', k=5, seed=42)
model = kmeans.fit(final_data)

In [23]:
# Evaluate the clustering: read the Within Set Sum of Squared Errors (WSSSE)
# from the fitted model's training summary and print it.
wssse = model.summary.trainingCost
print("With Set Sum of Squared Errors=", str(wssse))

With Set Sum of Squared Errors= 912.8886701353701


In [24]:
# Show the result: fetch the cluster centers from the fitted model and print
# them one per line. `centers` is reused below to build centers_df.
centers = model.clusterCenters()
print('Cluster Centers:')
for center in centers:
    print(center)

Cluster Centers:
[2.76960693 3.63128545]
[3.38398554 1.00918242]
[3.43256983 2.66251377]
[0.96755054 2.36754019]
[1.70471862 0.71909661]
[0.75463741 1.48020112]
[1.33679274 3.4704405 ]
[2.21696554 1.92807149]


In [25]:
# Run the fitted model over final_data; the cells below read the result's
# 'prediction' column.
predictions = model.transform(final_data)

In [26]:
# Peek at the 'prediction' value of the first five rows.
predictions.select('prediction').show(n=5)

+----------+
|prediction|
+----------+
|         7|
|         7|
|         7|
|         7|
|         7|
+----------+
only showing top 5 rows



In [27]:
# Count how many rows received each 'prediction' value.
(
    predictions
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  643|
|         6|  652|
|         3|  678|
|         5|  373|
|         4|  691|
|         7|  918|
|         2|  663|
|         0|  382|
+----------+-----+



In [None]:
# Preview three rows of final_data without truncating the vector columns.
final_data.show(n=3, truncate=False)

In [None]:
# Turn each 'scaledFeatures' vector into a plain Python list and build a new
# DataFrame from those lists (later cells refer to its columns as _1, _2, ...).
temp = (
    final_data
    .select("scaledFeatures")
    .rdd
    .map(lambda row: row[0].toArray().tolist())
    .toDF()
)

In [None]:
# Preview the first three rows of the unpacked scaled coordinates.
temp.show(n=3)

In [None]:
# Attach the predicted cluster label to each row of scaled coordinates.
# `data_result` was never defined anywhere in this notebook (NameError on a
# fresh kernel); take the label column from `predictions`.
data_result = predictions.select('prediction')

# Pair the two frames through a generated 'row_index', then drop it again.
# NOTE(review): this relies on monotonically_increasing_id() producing the same
# ids row-for-row in both frames, which presumably holds only while both keep
# the partitioning of final_data — verify the row count after the join.
temp = temp.withColumn('row_index', monotonically_increasing_id())
data_result = data_result.withColumn('row_index', monotonically_increasing_id())
temp = temp.join(data_result, on=["row_index"]).sort("row_index").drop("row_index")
temp.show(3)

In [34]:
# Give the scaled coordinates readable names and keep the cluster label.
# `_3` has to be kept as z_scale as well: the 3D scatter plot below reads
# df.z_scale, which the original two-column selection did not provide.
temp = temp.select(
    col("_1").alias("x_scale"),
    col("_2").alias("y_scale"),
    col("_3").alias("z_scale"),
    "prediction",
)
# Collect to pandas for plotting with matplotlib.
df = temp.toPandas()

In [35]:
# Put the cluster centers into a pandas DataFrame and display its first rows.
centers_df = pd.DataFrame(centers)
centers_df.head()

Unnamed: 0,0,1
0,2.769607,3.631285
1,3.383986,1.009182
2,3.43257,2.662514
3,0.967551,2.36754
4,1.704719,0.719097


In [None]:
# 3D scatter of the scaled points, coloured by predicted cluster.
# Figure.gca(projection=...) was removed in matplotlib 3.6; add_subplot() is
# the supported way to create 3D axes and also works on older releases.
threedee = plt.figure(figsize=(12, 10)).add_subplot(111, projection='3d')
threedee.scatter(df.x_scale, df.y_scale, df.z_scale, c=df.prediction)
threedee.set_xlabel('x')
threedee.set_ylabel('y')
threedee.set_zlabel('z')

plt.show()

### Combine results

In [None]:
# Join the columns held in `temp` (scaled coordinates and prediction) back onto
# final_data, pairing rows through a generated 'row_index' that is dropped from
# final_data after the join.
# NOTE(review): the pairing only works if monotonically_increasing_id() yields
# the same ids row-for-row in both frames. `temp` has already been through a
# join and a sort by this point, so its ids may not line up with final_data's —
# verify the row count of the joined frame.
# NOTE(review): final_data and temp are rebound in place, so re-running this
# cell alone presumably breaks; restart the kernel and run all cells in order.
final_data = final_data.withColumn('row_index', monotonically_increasing_id())
temp = temp.withColumn('row_index', monotonically_increasing_id())
final_data = final_data.join(temp, on=["row_index"]).sort("row_index").drop("row_index")
temp.show(3)

In [None]:
# Preview three rows of the combined result without truncating any column.
final_data.show(n=3, truncate=False)