# Useful links

http://geotrellis.readthedocs.io/en/latest/guide/core-concepts.html

Github :
https://github.com/locationtech/geotrellis

API reference (Scaladoc):
http://geotrellis.github.io/scaladocs/latest/#geotrellis.package

# Import libraries

In [ ]:
import geotrellis.proj4._

import geotrellis.spark._
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop.formats._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.tiling.FloatingLayoutScheme

import geotrellis.vector._

import geotrellis.raster._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff._
import geotrellis.raster.io.geotiff.reader.GeoTiffReader
import geotrellis.raster.io.geotiff.tags.TiffTags
import geotrellis.raster.io.geotiff.GeoTiff

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import geotrellis.proj4._
import geotrellis.spark._
import geotrellis.spark.io.hadoop._
import geotrellis.spark.io.hadoop.formats._
import geotrellis.spark.io.RasterReader
import geotrellis.spark.tiling.FloatingLayoutScheme
import geotrellis.vector._
import geotrellis.raster._
import geotrellis.raster.render._
import geotrellis.raster.io.geotiff._
import geotrellis.raster.io.geotiff.reader.GeoTiffReader
import geotrellis.raster.io.geotiff.tags.TiffTags
import geotrellis.raster.io.geotiff.GeoTiff
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path


# Important variables (run this cell every time you use GeoTrellis)

In [ ]:
// Implicit SparkContext that the GeoTrellis functions used in this notebook pick up.
// The type is written out explicitly: implicit definitions should declare their type
// rather than rely on inference.
implicit val sparkContext: org.apache.spark.SparkContext = sc

// Summon the RasterReader instance for (ProjectedExtent, Tile) records read with
// HadoopGeoTiffRDD.Options; this fails at compile time if no such implicit is in scope.
val rr = implicitly[RasterReader[HadoopGeoTiffRDD.Options, (ProjectedExtent, Tile)]]

sparkContext: org.apache.spark.SparkContext = org.apache.spark.SparkContext@3162bb66
rr: geotrellis.spark.io.RasterReader[geotrellis.spark.io.hadoop.HadoopGeoTiffRDD.Options,(geotrellis.vector.ProjectedExtent, geotrellis.raster.Tile)] = geotrellis.spark.io.RasterReader$$anon$1@7b3e0a76


# Parameters

In [ ]:
// Root URL of the HDFS cluster (ends with a slash so paths can be appended directly).
val HdfsUrl = "hdfs://hupi-factory-02-01-01-01/"
// Data directory, relative to HdfsUrl (also ends with a slash).
val dataRepo = "user/factory02/thailand_workshop/data_usgs/"
// Alternative data directory:
// val dataRepo = "user/luuthang211/vnu/"

// Landsat 8 scene identifier. The loading cells use it twice: as the folder name
// and as the prefix of the band file name (<bandPath>/<bandPath>_B1.TIF).
val bandPath = "LC08_L1TP_125052_20171231_20180103_01_T1"
// Alternative input:
// val bandPath = "IMAGERY.TIF"

HdfsUrl: String = hdfs://hupi-factory-02-01-01-01/
dataRepo: String = user/factory02/thailand_workshop/data_usgs/
bandPath: String = LC08_L1TP_125052_20171231_20180103_01_T1


# Load GeoTiff from HDFS

When we read a GeoTiff from HDFS, we get an RDD[(ProjectedExtent, Tile)].

From the GeoTrellis documentation, some important concepts to know beforehand:

- Tile is "A grid of numeric cells that represent some data on the Earth" 
- Cell is "A single unit of data in some grid, also called a Location in GIS"
- A ProjectedExtent combines an Extent and a CRS. By definition, an Extent (or "Bounding Box") "represents some area on the Earth".

There are two ways to read the file below; they differ only in how the repartitioning is done.

In [ ]:
// Reader options: ask for 100 partitions so the notebook runs more efficiently.
val options = HadoopGeoTiffRDD.Options(numPartitions = Some(100))

options: geotrellis.spark.io.hadoop.HadoopGeoTiffRDD.Options = Options(List(.tif, .TIF, .tiff, .TIFF),None,TIFFTAG_DATETIME,yyyy:MM:dd HH:mm:ss,Some(256),Some(100),Some(134217728),None)


In [ ]:
// First option: hand the reader options (including the partition count) to HadoopGeoTiffRDD.
val band = HadoopGeoTiffRDD[ProjectedExtent, Tile](
  new Path(s"$HdfsUrl$dataRepo$bandPath/${bandPath}_B1.TIF"),
  options
).map { case (projectedExtent, tile) =>
  // Keep the key untouched and convert the tile's cell type.
  (projectedExtent, tile.convert(DoubleConstantNoDataCellType))
}

// Alternative: read directly from HdfsUrl + dataRepo + bandPath.
// val band = HadoopGeoTiffRDD[ProjectedExtent, Tile](
//       new Path(HdfsUrl + dataRepo + bandPath),
//       options).map(l => (l._1, l._2.convert(DoubleConstantNoDataCellType)))

org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 1 times, most recent failure: Lost task 3.0 in stage 0.0 (TID 3, localhost, executor driver): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply$mcJ$sp(Utils.scala:356)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:322)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:322)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1303)
	at org.apache.spark.util.Utils$.copyStream(Utils.scala:362)
	at org.apache.spark.util.Utils$.downloadFile(Utils.scala:509)
	at org.apache.spark.util.Utils$.doFetchFile(Utils.scala:639)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:463)
	at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDepend

In [ ]:
// Second option: read through the SparkContext helper, convert each tile's cell type,
// then repartition explicitly into 100 partitions.
val band = sc.hadoopGeoTiffRDD(s"$HdfsUrl$dataRepo$bandPath/${bandPath}_B1.TIF")
  .map { case (projectedExtent, tile) =>
    (projectedExtent, tile.convert(DoubleConstantNoDataCellType))
  }
  .repartition(100)

band: org.apache.spark.rdd.RDD[(geotrellis.vector.ProjectedExtent, geotrellis.raster.Tile)] = MapPartitionsRDD[13] at repartition at <console>:106


# Inside RDD[ProjectedExtent, Tile]..

In [ ]:
// In this version, one GeoTiff is split into multiple Tiles (single-band or multi-band)
// when it is read, so counting the elements of the RDD gives the number of Tiles.
println(s"Number of Tiles in this band is ${band.count()}")

Number of Tiles in this band is 930


In [ ]:
// Number of Cells.
// Each tile holds cols * rows cells, so the total is summed from the tile dimensions
// instead of copying every tile into a Double array and flattening tens of millions
// of values just to count them. The product is computed as Long to avoid Int overflow,
// and fold(0L) yields 0 for an empty RDD, just as count() would.
println(
  "Number of Cells in this band is " +
    band.map { case (_, tile) => tile.cols.toLong * tile.rows }.fold(0L)(_ + _)
)

Number of Cells in this band is 58378481


In [ ]:
// Peek at the first 5 elements of the RDD; each element is a (ProjectedExtent, Tile) pair.
// `band` must already be defined by one of the loading cells above, otherwise this
// cell fails with "not found: value band".
band.take(5)

<console>:70: error: not found: value band
       band.take(5)
       ^


In [ ]:
// First element of each pair: the ProjectedExtent (extent plus CRS) of the tile.
band.map { case (projectedExtent, _) => projectedExtent }.take(10)

res13: Array[geotrellis.vector.ProjectedExtent] = Array(ProjectedExtent(Extent(551625.0, 1379055.0, 559305.0, 1386735.0),EPSG:32648), ProjectedExtent(Extent(628425.0, 1348335.0, 636105.0, 1356015.0),EPSG:32648), ProjectedExtent(Extent(705225.0, 1325295.0, 712905.0, 1332975.0),EPSG:32648), ProjectedExtent(Extent(551625.0, 1286895.0, 559305.0, 1294575.0),EPSG:32648), ProjectedExtent(Extent(628425.0, 1279215.0, 636105.0, 1286895.0),EPSG:32648), ProjectedExtent(Extent(705225.0, 1202415.0, 712905.0, 1210095.0),EPSG:32648), ProjectedExtent(Extent(551625.0, 1256175.0, 559305.0, 1263855.0),EPSG:32648), ProjectedExtent(Extent(628425.0, 1194735.0, 636105.0, 1202415.0),EPSG:32648), ProjectedExtent(Extent(705225.0, 1187055.0, 712905.0, 1194735.0),EPSG:32648), ProjectedExtent(Extent(559305.0, 137905...

In [ ]:
// The extent alone shows the bounding-box coordinates of each tile.
band.map { case (projectedExtent, _) => projectedExtent.extent }.take(10)

res15: Array[geotrellis.vector.Extent] = Array(Extent(551625.0, 1379055.0, 559305.0, 1386735.0), Extent(628425.0, 1348335.0, 636105.0, 1356015.0), Extent(705225.0, 1325295.0, 712905.0, 1332975.0), Extent(551625.0, 1286895.0, 559305.0, 1294575.0), Extent(628425.0, 1279215.0, 636105.0, 1286895.0), Extent(705225.0, 1202415.0, 712905.0, 1210095.0), Extent(551625.0, 1256175.0, 559305.0, 1263855.0), Extent(628425.0, 1194735.0, 636105.0, 1202415.0), Extent(705225.0, 1187055.0, 712905.0, 1194735.0), Extent(559305.0, 1379055.0, 566985.0, 1386735.0))


In [ ]:
// Second element of each pair: the Tile itself, which shows its number of columns and rows.
band.map { case (_, tile) => tile }.take(10)

res17: Array[geotrellis.raster.Tile] = Array(DoubleConstantNoDataArrayTile([D@5c6074be,256,256), DoubleConstantNoDataArrayTile([D@2ab20ee2,256,256), DoubleConstantNoDataArrayTile([D@7ad3b249,256,256), DoubleConstantNoDataArrayTile([D@3fcdf7d5,256,256), DoubleConstantNoDataArrayTile([D@6057a116,256,256), DoubleConstantNoDataArrayTile([D@5a2866e0,256,256), DoubleConstantNoDataArrayTile([D@3d8ed616,256,256), DoubleConstantNoDataArrayTile([D@6d2ded2d,256,256), DoubleConstantNoDataArrayTile([D@302fae8f,256,256), DoubleConstantNoDataArrayTile([D@2929d10c,256,256))


# We can convert ProjectedExtent to SpatialKey...

With GeoTrellis on Spark, we can also key tiles by SpatialKey instead of ProjectedExtent.

A SpatialKey indicates the spatial position of a tile within a layer. A Tile Layer is a grid (or cube) of Tiles; in short, each element of a Tile Layer is simply a Tile.

In [ ]:
// Collect the layer metadata ("Layer Metadata" stores information critical to Tile Layer IO).
// collectMetadata returns a pair; only its second component, the metadata, is kept.
val metadata = band.collectMetadata[SpatialKey](FloatingLayoutScheme())._2

// Re-key the tiles by SpatialKey according to the layout described by the metadata.
val tiles = band.tileToLayout[SpatialKey](metadata)

metadata: geotrellis.spark.TileLayerMetadata[geotrellis.spark.SpatialKey] = TileLayerMetadata(float64,GridExtent(Extent(528585.0, 1156335.0, 758985.0, 1394415.0),30.0,30.0),Extent(528585.0, 1162785.0, 755415.0, 1394415.0),EPSG:32648,KeyBounds(SpatialKey(0,0),SpatialKey(29,30)))
tiles: org.apache.spark.rdd.RDD[(geotrellis.spark.SpatialKey, geotrellis.raster.Tile)] = ShuffledRDD[21] at reduceByKey at TileRDDMerge.scala:51


In [ ]:
// Look at a few SpatialKeys: each one is a (col, row) position in the layout grid.
tiles.map { case (key, _) => key }.take(10)

res20: Array[geotrellis.spark.SpatialKey] = Array(SpatialKey(10,26), SpatialKey(28,27), SpatialKey(6,17), SpatialKey(0,24), SpatialKey(27,11), SpatialKey(23,1), SpatialKey(24,14), SpatialKey(14,27), SpatialKey(29,13), SpatialKey(7,3))


In [ ]:
/*
The SpatialKeys span 30 columns (0 to 29) and 31 rows (0 to 30),
so the layer holds 30 * 31 = 930 tiles, the same number of Tiles counted in the band earlier.
The job below counts the tiles in each column; here each column holds 31.
*/
tiles.map { case (key, _) => (key.col, 1) }.reduceByKey(_ + _).take(50)

res22: Array[(Int, Int)] = Array((0,31), (1,31), (2,31), (3,31), (4,31), (5,31), (6,31), (7,31), (8,31), (9,31), (10,31), (11,31), (12,31), (13,31), (14,31), (15,31), (16,31), (17,31), (18,31), (19,31), (20,31), (21,31), (22,31), (23,31), (24,31), (25,31), (26,31), (27,31), (28,31), (29,31))


# From RDD[SpatialKey, Tile] to Raster

In [ ]:
// To save to HDFS we need a single Raster[Tile] instead of an RDD of tiles.
// The RDD holds many Tiles (single-band or multi-band in this version of GeoTrellis),
// so they are attached to their metadata and stitched together into one raster.
val raster = {
  val layer = ContextRDD(tiles, metadata)
  layer.stitch
}

raster: geotrellis.raster.Raster[geotrellis.raster.Tile] = Raster(DoubleConstantNoDataArrayTile([D@4de0e9c9,7680,7936),Extent(528585.0, 1156335.0, 758985.0, 1394415.0))


# From Raster to GeoTiff

A GeoTiff is raster data, but the raster must also carry a CRS to become a GeoTiff.

In [ ]:
// Build the GeoTiff from the stitched raster's tile and extent plus the CRS from the layer metadata.
val geotiff = GeoTiff(raster.tile, raster.extent, metadata.crs)

geotiff: geotrellis.raster.io.geotiff.SinglebandGeoTiff = SinglebandGeoTiff(DoubleConstantNoDataArrayTile([D@4de0e9c9,7680,7936),Extent(528585.0, 1156335.0, 758985.0, 1394415.0),EPSG:32648,Tags(Map(),List()),GeoTiffOptions(geotrellis.raster.io.geotiff.Striped@36dc0b51,geotrellis.raster.io.geotiff.compression.NoCompression$@6a859299,1,None))
