### Spark ML k-meansによるクラスタリング

In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

#### SparkSessionの作成

In [2]:
# Obtain the SparkSession for this notebook: reuses an already-active session
# if there is one, otherwise creates a new one with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

#### データの読み込み

In [3]:
# Read the iris CSV into a Spark DataFrame. The first row is used as the
# header and column types are inferred from the values.
data = (
    spark.read
    .format('csv')
    .options(sep = ',', header = True, inferSchema = True)
    .load('data/iris.csv')
)

In [4]:
# Preview the first 20 rows (long cell values are truncated for display).
data.show(n = 20, truncate = True)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [5]:
# List each column as a (name, type-string) pair to check the inferred schema.
[(str(field.name), field.dataType.simpleString()) for field in data.schema.fields]

[('sepal_length', 'double'),
 ('sepal_width', 'double'),
 ('petal_length', 'double'),
 ('petal_width', 'double'),
 ('species', 'string')]

In [6]:
# Count the number of rows for each species label.
(
    data
    .groupBy('species')
    .count()
    .show()
)

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



#### ①特徴量のアッセンブル化ステージ

In [7]:
# Numeric columns used as clustering inputs ('species' is deliberately excluded).
features = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

# Stage 1: pack the input columns into a single vector column named 'features'.
assemble = (
    VectorAssembler()
    .setInputCols(features)
    .setOutputCol('features')
)

#### ②k-meansのインスタンス化ステージ

In [8]:
# Number of clusters to fit.
k = 3

# Stage 2: k-means estimator.
# - The seed is pinned because k-means initialisation is randomised; without an
#   explicit seed the cluster ids / assignments may differ between kernel
#   sessions, so Restart & Run All would not reproduce the same results.
# - featuresCol is set explicitly to the VectorAssembler's output column name
#   ('features', which is also the default) so the coupling between the two
#   pipeline stages is visible here.
kmeans = (
    KMeans()
    .setK(k)
    .setSeed(42)
    .setFeaturesCol('features')
)

#### パイプラインの登録（①～②）

In [9]:
# Chain the stages in execution order: feature assembly first, then k-means.
pipeline = Pipeline().setStages([assemble, kmeans])

#### 学習

In [10]:
# Fit every pipeline stage on the full dataset; the result is a fitted
# pipeline model that can be applied with transform().
model = pipeline.fit(dataset = data)

#### 推論

In [11]:
# Apply the fitted pipeline to the same data it was trained on; this appends
# the assembled 'features' vector and the assigned cluster id ('prediction').
pred = model.transform(dataset = data)

# Preview the first 20 rows of the result.
pred.show(n = 20, truncate = True)

+------------+-----------+------------+-----------+-------+-----------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|prediction|
+------------+-----------+------------+-----------+-------+-----------------+----------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|         1|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|         1|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|         1|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|         1|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|         1|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|         1|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|         1|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|         1|
|         4.4|       

#### 精度評価

In [12]:
# Cross-tabulate the true species against the assigned cluster id, ordered by
# species then cluster, to see how the clusters line up with the labels.
(
    pred
    .groupBy('species', 'prediction')
    .count()
    .orderBy('species', 'prediction')
    .show()
)

+----------+----------+-----+
|   species|prediction|count|
+----------+----------+-----+
|    setosa|         1|   50|
|versicolor|         0|    3|
|versicolor|         2|   47|
| virginica|         0|   36|
| virginica|         2|   14|
+----------+----------+-----+

