# Import libraries

In [ ]:
import geotrellis.raster._
import geotrellis.raster.vectorize._
import geotrellis.raster.vectorize
import geotrellis.raster.io.geotiff._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff.GeoTiff
import geotrellis.raster.resample._

import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop
import geotrellis.spark.tiling._
import geotrellis.spark.tiling.FloatingLayoutScheme

import geotrellis.vector._

import org.apache.spark.SparkContext
import org.apache.spark.rdd._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}

import java.net.URI
import scala.math.BigDecimal.RoundingMode
import org.apache.hadoop.fs.Path

import geotrellis.raster._
import geotrellis.raster.vectorize._
import geotrellis.raster.vectorize
import geotrellis.raster.io.geotiff._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff.GeoTiff
import geotrellis.raster.resample._
import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop
import geotrellis.spark.tiling._
import geotrellis.spark.tiling.FloatingLayoutScheme
import geotrellis.vector._
import org.apache.spark.SparkContext
import org.apache.spark.rdd._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import java....

# Important variables

In [ ]:
// Resolve the RasterReader instance for single-band (ProjectedExtent, Tile) GeoTiff reads from implicit scope.
val rr = implicitly[RasterReader[HadoopGeoTiffRDD.Options, (ProjectedExtent, Tile)]]
// Expose the notebook's SparkContext (`sc`) as an implicit value.
// The type is written out so implicit resolution does not depend on inference.
implicit val sparkContext: SparkContext = sc

rr: geotrellis.spark.io.RasterReader[geotrellis.spark.io.hadoop.HadoopGeoTiffRDD.Options,(geotrellis.vector.ProjectedExtent, geotrellis.raster.Tile)] = geotrellis.spark.io.RasterReader$$anon$1@3bc937c8
sparkContext: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6e87c3eb


# Parameters

In [ ]:
// Root URL of the HDFS cluster; the repository paths below are appended to it.
val HdfsUrl: String = "hdfs://hupi-factory-02-01-01-01/"
// Directory (relative to HdfsUrl) containing the multiband GeoTiff inputs.
val dataRepo: String = "user/factory02/thailand_workshop/data_multiBands/"
// Directory (relative to HdfsUrl) that receives the KMeans outputs.
val saveRepo: String = "user/factory02/thailand_workshop/kmeans/"
// Base name of the Landsat scene; used for the input file name and the output paths.
val landsatName: String = "LC08_L1TP_125052_20171231_20180103_01_T1"
// Number of clusters requested from KMeans.
val k: Int = 9

HdfsUrl: String = hdfs://hupi-factory-02-01-01-01/
dataRepo: String = user/factory02/thailand_workshop/data_multiBands/
saveRepo: String = user/factory02/thailand_workshop/kmeans/
landsatName: String = LC08_L1TP_125052_20171231_20180103_01_T1
k: Int = 9


In [ ]:
// Output location for this scene and cluster count: <HdfsUrl><saveRepo><landsatName>/<k>
val saveAddress = s"$HdfsUrl$saveRepo$landsatName/$k"
// Output location of the PMML export: <HdfsUrl><saveRepo><landsatName>_PMML
val saveAddressPmml = s"$HdfsUrl$saveRepo${landsatName}_PMML"
// Hadoop Path wrappers around the two addresses, for use with the FileSystem API.
val savePath = new Path(saveAddress)
val savePathPmml = new Path(saveAddressPmml)

saveAddress: String = hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1/9
saveAddressPmml: String = hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1_PMML
savePath: org.apache.hadoop.fs.Path = hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1/9
savePathPmml: org.apache.hadoop.fs.Path = hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1_PMML


# To avoid duplicates in HDFS

In [ ]:
// Hadoop configuration carried by the notebook's SparkContext.
val conf = sc.hadoopConfiguration
// FileSystem handle for the cluster addressed by HdfsUrl.
val fs = org.apache.hadoop.fs.FileSystem.get(new URI(HdfsUrl), conf)

conf: org.apache.hadoop.conf.Configuration = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1383697495_113, ugi=root (auth:SIMPLE)]]


In [ ]:
// Remove whatever already exists at savePath so this run does not collide with earlier output.
// The second argument (true) requests a recursive delete; the call returns a Boolean status.
// NOTE(review): the original comment calls this output "the illustration" — confirm what is stored at savePath.
fs.delete(savePath,true)

res15: Boolean = true


In [ ]:
// Remove any PMML export already present at savePathPmml before a new one is written.
// The second argument (true) requests a recursive delete; the call returns a Boolean status.
fs.delete(savePathPmml,true)

# Load the multiband GeoTIFF from HDFS

In [ ]:
// Reader options asking for 100 partitions.
// NOTE(review): the load call in the next cell does not pass this value — confirm whether it is used elsewhere.
val options = HadoopGeoTiffRDD.Options(numPartitions = Some(100))

In [ ]:
// Read the scene's multiband GeoTiff from HDFS and spread it over 100 partitions.
// Per the original note, the file holds every band except band 8.
val sourceTiles = {
  val tiffAddress = s"$HdfsUrl$dataRepo${landsatName}.tif"
  sc.hadoopMultibandGeoTiffRDD(tiffAddress).repartition(100)
}

sourceTiles: org.apache.spark.rdd.RDD[(geotrellis.vector.ProjectedExtent, geotrellis.raster.MultibandTile)] = MapPartitionsRDD[7] at repartition at <console>:120


# Prepare input for KMeans

Spark's MLlib KMeans expects an `RDD[Vector]` as input, so each pixel must first be turned into one feature vector holding its value in every band.

In [ ]:
// Convert every multiband tile into an array of per-pixel feature vectors.
// Each vector holds the pixel's values in bands 0 to 9, in band order, so the
// result is one Array[Vector] per tile with pixels in the tile's flat array order.
val rddArrayOfTiles = sourceTiles.map { case (_, multibandTile) =>
  // Number of bands used as features (bands 0 to 9).
  val featureBandCount = 10
  // One flat Double array per band; the band index becomes the feature index.
  // NOTE(review): NoData cells presumably surface as NaN after this conversion —
  // confirm how they should be treated before clustering.
  val bandValues: Array[Array[Double]] = Array.tabulate(featureBandCount) { bandIndex =>
    multibandTile.band(bandIndex).convert(DoubleConstantNoDataCellType).toArrayDouble()
  }
  // Stop at the shortest band, matching the truncation that chained zips apply.
  val pixelCount = bandValues.map(_.length).min
  Array.tabulate(pixelCount) { pixelIndex =>
    Vectors.dense(bandValues.map(band => band(pixelIndex)))
  }
}

rddArrayOfTiles: org.apache.spark.rdd.RDD[Array[org.apache.spark.mllib.linalg.Vector]] = MapPartitionsRDD[10] at map at <console>:134


In [ ]:
// Flatten the per-tile arrays into a single RDD with one feature vector per pixel.
val input = rddArrayOfTiles.flatMap(_.iterator)

input: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[11] at flatMap at <console>:124


# Build a KMeans model

In [ ]:
// Fit a KMeans model with k clusters on the pixel vectors, capped at 20 iterations.
val clusters = new KMeans()
  .setK(k)
  .setMaxIterations(20)
  .run(input)

clusters: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@544f835f


# Save model to HDFS

In [ ]:
/*
// Save model
clusters.save(sc, saveAddress)
*/

# Save model in PMML format

In [ ]:
// Render the trained model as a PMML string and print it in the notebook.
println(s"PMML Model:\n${clusters.toPMML()}")

PMML Model:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML version="4.2" xmlns="http://www.dmg.org/PMML-4_2">
    <Header description="k-means clustering">
        <Application name="Apache Spark MLlib" version="2.1.1"/>
        <Timestamp>2018-01-22T20:42:15</Timestamp>
    </Header>
    <DataDictionary numberOfFields="10">
        <DataField name="field_0" optype="continuous" dataType="double"/>
        <DataField name="field_1" optype="continuous" dataType="double"/>
        <DataField name="field_2" optype="continuous" dataType="double"/>
        <DataField name="field_3" optype="continuous" dataType="double"/>
        <DataField name="field_4" optype="continuous" dataType="double"/>
        <DataField name="field_5" optype="continuous" dataType="double"/>
        <DataField name="field_6" optype="continuous" dataType="double"/>
        <DataField name="field_7" optype="continuous" dataType="double"/>
        <DataField name="field_8" optype="continuous" dataType="do

In [ ]:
// Write the trained model in PMML format to saveAddressPmml on the distributed file system,
// using the notebook's SparkContext (`sc`).
clusters.toPMML(sc, saveAddressPmml)