In this notebook, we will choose the optimal k with Elbow method

source : https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set

"The elbow method looks at the percentage of variance explained as a function of the number of clusters: One should choose a number of clusters so that adding another cluster doesn't give much better modeling of the data. More precisely, if one plots the percentage of variance explained by the clusters against the number of clusters, the first clusters will add much information (explain a lot of variance), but at some point the marginal gain will drop, giving an angle in the graph. The number of clusters is chosen at this point, hence the "elbow criterion". This "elbow" cannot always be unambiguously identified.[1] Percentage of variance explained is the ratio of the between-group variance to the total variance, also known as an F-test. A slight variation of this method plots the curvature of the within group variance."

In [ ]:
import geotrellis.raster._
import geotrellis.raster.vectorize._
import geotrellis.raster.vectorize
import geotrellis.raster.io.geotiff._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff.GeoTiff
import geotrellis.raster.resample._

import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop
import geotrellis.spark.tiling._
import geotrellis.spark.tiling.FloatingLayoutScheme

import geotrellis.vector._

import org.apache.spark.SparkContext
import org.apache.spark.rdd._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}

import java.net.URI
import scala.math.BigDecimal.RoundingMode
import org.apache.hadoop.fs.Path
import scala.collection.mutable.ListBuffer

import geotrellis.raster._
import geotrellis.raster.vectorize._
import geotrellis.raster.vectorize
import geotrellis.raster.io.geotiff._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff.GeoTiff
import geotrellis.raster.resample._
import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop
import geotrellis.spark.tiling._
import geotrellis.spark.tiling.FloatingLayoutScheme
import geotrellis.vector._
import org.apache.spark.SparkContext
import org.apache.spark.rdd._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import java....

# Parameters

In [ ]:
val HdfsUrl = "hdfs://hupi-factory-02-01-01-01/"
val dataRepo1 = "user/factory02/thailand_workshop/data_multiBands/"
val dataRepo2 = "user/factory02/thailand_workshop/kmeans/"
val landsatName = "LC08_L1TP_125052_20171231_20180103_01_T1"

HdfsUrl: String = hdfs://hupi-factory-02-01-01-01/
dataRepo1: String = user/factory02/thailand_workshop/data_multiBands/
dataRepo2: String = user/factory02/thailand_workshop/kmeans/
landsatName: String = LC08_L1TP_125052_20171231_20180103_01_T1


# Load GeoTiff from HDFS and create the dataset for Kmeans

In [ ]:
// We get multi bands from HDFS (except band 8) 
val sourceTiles = sc.hadoopMultibandGeoTiffRDD(HdfsUrl + dataRepo1 + landsatName + ".tif").repartition(100)

sourceTiles: org.apache.spark.rdd.RDD[(geotrellis.vector.ProjectedExtent, geotrellis.raster.MultibandTile)] = MapPartitionsRDD[7] at repartition at <console>:119


In [ ]:
// We convert multi tiles into one vector of features in RDD
val input = sourceTiles.map (l => (l._2.band(0).convert(DoubleConstantNoDataCellType).toArrayDouble(), 
                       l._2.band(1).convert(DoubleConstantNoDataCellType).toArrayDouble(), 
                       l._2.band(2).convert(DoubleConstantNoDataCellType).toArrayDouble(),
                       l._2.band(3).convert(DoubleConstantNoDataCellType).toArrayDouble(), 
                       l._2.band(4).convert(DoubleConstantNoDataCellType).toArrayDouble(),
                       l._2.band(5).convert(DoubleConstantNoDataCellType).toArrayDouble(), 
                       l._2.band(6).convert(DoubleConstantNoDataCellType).toArrayDouble(),
                       l._2.band(7).convert(DoubleConstantNoDataCellType).toArrayDouble(),
                       l._2.band(8).convert(DoubleConstantNoDataCellType).toArrayDouble(), 
                       l._2.band(9).convert(DoubleConstantNoDataCellType).toArrayDouble()))
.map(l => l._1.zip(l._2).zip(l._3).zip(l._4).zip(l._5).zip(l._6).zip(l._7).zip(l._8).zip(l._9).zip(l._10))
.map(l => l.map(k => Vectors.dense(k._1._1._1._1._1._1._1._1._1, k._1._1._1._1._1._1._1._1._2, 
                                   k._1._1._1._1._1._1._1._2, k._1._1._1._1._1._1._2,
                                  k._1._1._1._1._1._2, k._1._1._1._1._2, k._1._1._1._2,
                                   k._1._1._2, k._1._2, k._2))).flatMap(l => l)

input: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[11] at flatMap at <console>:135


# List of all Kmeans model in HDFS

In [ ]:
val conf = sc.hadoopConfiguration 
val fs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(HdfsUrl), conf)

conf: org.apache.hadoop.conf.Configuration = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1855386348_12, ugi=root (auth:SIMPLE)]]


In [ ]:
val status = fs.listStatus(new Path(HdfsUrl + dataRepo2 + landsatName))
val listPaths = status.map(l => l.getPath.toString).filter(l => !l.contains(".png"))

status: Array[org.apache.hadoop.fs.FileStatus] = Array(FileStatus{path=hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1/10; isDirectory=true; modification_time=1516034180793; access_time=0; owner=factory02; group=supergroup; permission=rwxrwxrwx; isSymlink=false}, FileStatus{path=hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1/11; isDirectory=true; modification_time=1516034817073; access_time=0; owner=factory02; group=supergroup; permission=rwxrwxrwx; isSymlink=false}, FileStatus{path=hdfs://hupi-factory-02-01-01-01/user/factory02/thailand_workshop/kmeans/LC08_L1TP_125052_20171231_20180103_01_T1/12; isDirectory=true; modification_time=1516035467422; access_tim...

In [ ]:
val numberModels = listPaths.length
val numClusters = new ListBuffer[String] 
val WSSSE = new ListBuffer[Double]

numberModels: Int = 28
numClusters: scala.collection.mutable.ListBuffer[String] = ListBuffer()
WSSSE: scala.collection.mutable.ListBuffer[Double] = ListBuffer()


In [ ]:
// We do the loop that compute all WSSSE for each Kmeans model
for (i <- 0 to (numberModels-1)) {
  val k = listPaths(i).split("/").last
  val clusters = KMeansModel.load(sc, listPaths(i))
  // Evaluate clustering by computing Within Set Sum of Squared Errors
  val w = clusters.computeCost(input)
  numClusters += k
  WSSSE += w
}

In [ ]:
// Create the coordinates x corresponding to k and y as WSSSE
val x = numClusters.toList
val y = WSSSE.toList

x: List[String] = List(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3, 30, 4, 5, 6, 7, 8, 9)
y: List[Double] = List(4.384089928208217E14, 4.1842139728097406E14, 3.9086565861284856E14, 3.742452438620692E14, 3.53610743052162E14, 3.324549553331858E14, 3.224136454981789E14, 3.085211016961698E14, 3.0407603247552706E14, 2.882359468016868E14, 2.7660116444340216E14, 2.6765306530376616E14, 2.819800538417536E14, 2.614258714649371E14, 2.4379860183863322E14, 2.541478525077957E14, 2.3429985771893962E14, 2.3065466626481803E14, 2.2013649328224206E14, 2.1475773367964694E14, 1.5196548536124038E15, 2.1482426800166603E14, 1.2697004421969332E15, 8.643314888141962E14, 6.816967102565431E14, 6.082935922800556E14, 5.2728254483075056E14, 4.751226426587388E14)


In [ ]:
val wsssePerK = x.zip(y).sortBy(_._1).map(l => (l._1.toInt, l._2))

wsssePerK: List[(Int, Double)] = List((10,4.384089928208217E14), (11,4.1842139728097406E14), (12,3.9086565861284856E14), (13,3.742452438620692E14), (14,3.53610743052162E14), (15,3.324549553331858E14), (16,3.224136454981789E14), (17,3.085211016961698E14), (18,3.0407603247552706E14), (19,2.882359468016868E14), (20,2.7660116444340216E14), (21,2.6765306530376616E14), (22,2.819800538417536E14), (23,2.614258714649371E14), (24,2.4379860183863322E14), (25,2.541478525077957E14), (26,2.3429985771893962E14), (27,2.3065466626481803E14), (28,2.2013649328224206E14), (29,2.1475773367964694E14), (3,1.5196548536124038E15), (30,2.1482426800166603E14), (4,1.2697004421969332E15), (5,8.643314888141962E14), (6,6.816967102565431E14), (7,6.082935922800556E14), (8,5.2728254483075056E14), (9,4.751226426587388E14))


# Save results to MongoDB

In [ ]:
// Convert to df
val df = sc.parallelize(wsssePerK).toDF("k", "wssse")

df: org.apache.spark.sql.DataFrame = [k: int, wssse: double]


In [ ]:
// We save the collection to Mongo by overwriting it
df.write.format("com.mongodb.spark.sql").option("uri", s"mongodb://10.100.2.7:27017/hupi.elbowMethodForKmeans")
.mode("overwrite").save()

# VIsualizations of results in notebook

In [ ]:
wsssePerK.sortBy(_._1)

res15: List[(Int, Double)] = List((3,1.5196548536124038E15), (4,1.2697004421969332E15), (5,8.643314888141962E14), (6,6.816967102565431E14), (7,6.082935922800556E14), (8,5.2728254483075056E14), (9,4.751226426587388E14), (10,4.384089928208217E14), (11,4.1842139728097406E14), (12,3.9086565861284856E14), (13,3.742452438620692E14), (14,3.53610743052162E14), (15,3.324549553331858E14), (16,3.224136454981789E14), (17,3.085211016961698E14), (18,3.0407603247552706E14), (19,2.882359468016868E14), (20,2.7660116444340216E14), (21,2.6765306530376616E14), (22,2.819800538417536E14), (23,2.614258714649371E14), (24,2.4379860183863322E14), (25,2.541478525077957E14), (26,2.3429985771893962E14), (27,2.3065466626481803E14), (28,2.2013649328224206E14), (29,2.1475773367964694E14), (30,2.1482426800166603E14))
