# Importer les bibliothèques

In [ ]:
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors


# Lire les données dans HDFS

In [ ]:
// Load the iris dataset (iris.csv) from HDFS into a DataFrame.
// - header:      the first line of the file holds the column names
// - inferSchema: let Spark derive the column types from the file contents
// - delimiter:   fields are comma-separated
// The terminal `.csv(path)` already selects the CSV data source, so no separate
// `.format("csv")` call is needed.
val data = sparkSession.read
.option("header", "true")
.option("inferSchema", "true")
.option("delimiter", ",")
.csv("hdfs://ecoles.node1.pro.hupi.loc:8020/user/ecoles/kmeans/iris.csv")

data: org.apache.spark.sql.DataFrame = [sepal_length: double, sepal_width: double ... 3 more fields]


In [ ]:
// Print the inferred schema as a tree to verify the column names and types.
data.schema.printTreeString()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



# Préparer les données d'entrée pour créer le modèle

In [ ]:
// Convert the DataFrame into an RDD of
// (sepalLength, sepalWidth, petalLength, petalWidth, species) tuples, because the
// RDD-based MLlib KMeans takes an RDD as input.
//
// Mapping over `data.rdd` works directly on the Rows, so no Dataset encoder round-trip
// is involved. The typed patterns replace the unchecked `asInstanceOf` casts: a row
// containing a null cell or a value of an unexpected type now fails fast with a
// MatchError instead of being silently coerced (a null numeric cell used to become 0.0).
val rdd_data = data.rdd.map {
  case org.apache.spark.sql.Row(
        sepalLength: Double, sepalWidth: Double,
        petalLength: Double, petalWidth: Double,
        species: String) =>
    (sepalLength, sepalWidth, petalLength, petalWidth, species)
}

rdd_data: org.apache.spark.rdd.RDD[(Double, Double, Double, Double, String)] = MapPartitionsRDD[18] at rdd at <console>:77


In [ ]:
// Fetch the first 10 tuples to visually check the result of the conversion.
rdd_data.take(10)

res11: Array[(Double, Double, Double, Double, String)] = Array((5.1,3.5,1.4,0.2,setosa), (4.9,3.0,1.4,0.2,setosa), (4.7,3.2,1.3,0.2,setosa), (4.6,3.1,1.5,0.2,setosa), (5.0,3.6,1.4,0.2,setosa), (5.4,3.9,1.7,0.4,setosa), (4.6,3.4,1.4,0.3,setosa), (5.0,3.4,1.5,0.2,setosa), (4.4,2.9,1.4,0.2,setosa), (4.9,3.1,1.5,0.1,setosa))


In [ ]:
// Keep only the four numeric features as dense vectors (the species label is dropped).
// The result is cached because it is reused below for both training and cost evaluation.
val input = rdd_data.map { case (sepalLength, sepalWidth, petalLength, petalWidth, _) =>
  Vectors.dense(sepalLength, sepalWidth, petalLength, petalWidth)
}.cache()

input: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[19] at map at <console>:74


# Créer un modèle KMeans

In [ ]:
// Train a k-means model that groups the feature vectors into `numClusters` (3) clusters,
// with the number of iterations capped at `numIterations` (20).
val numClusters = 3
val numIterations = 20
val clusters = new KMeans().setK(numClusters).setMaxIterations(numIterations).run(input)

numClusters: Int = 3
numIterations: Int = 20
clusters: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@270cc3da


In [ ]:
// Evaluate the clustering with the Within Set Sum of Squared Errors (WSSSE),
// computed by the model over the same vectors it was trained on.
val WSSSE = clusters.computeCost(input)
println(s"Within Set Sum of Squared Errors = $WSSSE")

/*
// To save the model (e.g. to HDFS) and load it back
clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
*/

Within Set Sum of Squared Errors = 78.94506582597637
WSSSE: Double = 78.94506582597637


# Convertir en PMML

In [ ]:
// Print the model as PMML (this is the text to copy-paste when creating the predict endpoint).
println(s"PMML Model:\n${clusters.toPMML()}")

PMML Model:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML version="4.2" xmlns="http://www.dmg.org/PMML-4_2">
    <Header description="k-means clustering">
        <Application name="Apache Spark MLlib" version="2.1.1"/>
        <Timestamp>2018-01-29T16:36:16</Timestamp>
    </Header>
    <DataDictionary numberOfFields="4">
        <DataField name="field_0" optype="continuous" dataType="double"/>
        <DataField name="field_1" optype="continuous" dataType="double"/>
        <DataField name="field_2" optype="continuous" dataType="double"/>
        <DataField name="field_3" optype="continuous" dataType="double"/>
    </DataDictionary>
    <ClusteringModel modelName="k-means" functionName="clustering" modelClass="centerBased" numberOfClusters="3">
        <MiningSchema>
            <MiningField name="field_0" usageType="active"/>
            <MiningField name="field_1" usageType="active"/>
            <MiningField name="field_2" usageType="active"/>
            <MiningFi

In [ ]:
/*
// Export the model to a local file in PMML format
clusters.toPMML("/tmp/kmeans.xml")

// Export the model to a directory on a distributed file system in PMML format
clusters.toPMML(sc, "/tmp/kmeans")
*/