### GeoSpark
GeoSpark is a cluster computing system for processing large-scale spatial data.
GeoSpark extends Apache Spark with a set of out-of-the-box Spatial Resilient Distributed
Datasets (SRDDs) that efficiently load, process, and analyze large-scale spatial data across 
machines. GeoSpark provides APIs that let Apache Spark programmers easily develop their spatial 
analysis programs with Spatial Resilient Distributed Datasets (SRDDs), which have in-house support 
for geometrical and spatial queries (Range, K Nearest Neighbors, Join).


In [ ]:
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel;

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel


In [ ]:
/*---------------------------- Start an example Spatial Range Query without Index ----------------------------*/
import org.datasyslab.geospark.spatialOperator.RangeQuery; 
import org.datasyslab.geospark.spatialRDD.PointRDD;
import com.vividsolutions.jts.geom.Envelope;
import org.datasyslab.geospark.enums.FileDataSplitter;

import org.datasyslab.geospark.spatialOperator.RangeQuery
import org.datasyslab.geospark.spatialRDD.PointRDD
import com.vividsolutions.jts.geom.Envelope
import org.datasyslab.geospark.enums.FileDataSplitter


In [ ]:
/* Range query window; the Envelope arguments are given as minX, maxX, minY, maxY. */
val queryEnvelope = new Envelope(-113.79, -109.73, 32.99, 35.08)

/*
 * Load the points to be queried. Arguments passed to PointRDD after the input location:
 *  - 0: the starting column of the spatial data in the input file.
 *  - FileDataSplitter.CSV: the data format is CSV. CSV, TSV, WKT, GeoJSON and a
 *    self-defined format mapper are supported.
 *  - false: each spatial object doesn't need to carry the original input tuple with it.
 */
val objectRDD = {
  val inputLocation = "hdfs://hupi-factory-02-01-01-01:8020/user/hupi/dataset_torusGeoSpark/arealm.csv"
  new PointRDD(sc, inputLocation, 0, FileDataSplitter.CSV, false, StorageLevel.MEMORY_ONLY)
}

/* Run the range query (both flags false, no spatial index) and count the matching points. */
val resultSize = {
  val matches = RangeQuery.SpatialRangeQuery(objectRDD, queryEnvelope, false, false)
  matches.count()
}

queryEnvelope: com.vividsolutions.jts.geom.Envelope = Env[-113.79 : -109.73, 32.99 : 35.08]
objectRDD: org.datasyslab.geospark.spatialRDD.PointRDD = org.datasyslab.geospark.spatialRDD.PointRDD@7a15fad
resultSize: Long = 445


In [ ]:
/* 
 * Notes on the last two arguments of the RangeQuery.SpatialRangeQuery call in the previous cell:
 * The first false means a point is considered only if it is fully covered by the query window;
 * objects that intersect the window but are not fully covered by it are not considered.
 * The second false means don't use a spatial index.
 */

In [ ]:
/*---------------------------- Start an example Spatial Range Query with Index ----------------------------*/
import org.datasyslab.geospark.spatialOperator.RangeQuery; 
import org.datasyslab.geospark.spatialRDD.PointRDD;
import com.vividsolutions.jts.geom.Envelope;
import org.datasyslab.geospark.enums.FileDataSplitter;
import org.datasyslab.geospark.enums.IndexType;

/* Range query window; the Envelope arguments are given as minX, maxX, minY, maxY. */
val queryEnvelope = new Envelope(-113.79, -109.73, 32.99, 35.08)

/*
 * Load the points to be queried. Arguments passed to PointRDD after the input location:
 *  - 0: the starting column of the spatial data in the input file.
 *  - FileDataSplitter.CSV: this enum value means the data format is CSV. CSV, TSV, WKT,
 *    GeoJSON and a self-defined format mapper are supported.
 *  - false: each spatial object doesn't need to carry the original input tuple with it.
 */
val objectRDD = {
  val inputLocation = "hdfs://hupi-factory-02-01-01-01:8020/user/hupi/dataset_torusGeoSpark/arealm.csv"
  new PointRDD(sc, inputLocation, 0, FileDataSplitter.CSV, false, StorageLevel.MEMORY_ONLY)
}

/*
 * Build the spatial index before querying.
 *  - IndexType.RTREE: the index type is R-tree. R-Tree and Quad-Tree indexes are supported.
 *  - false: build the index on the original spatial RDD instead of the spatially partitioned
 *    RDD. ONLY set this to true when doing a Spatial Join Query.
 */
objectRDD.buildIndex(IndexType.RTREE, false)

/*
 * Run the range query and count the matching points.
 *  - false: a point is considered only if it is fully covered by the query window; objects
 *    that intersect the window but are not fully covered by it are not considered.
 *  - true: use the spatial index that was built above.
 */
val resultSize = {
  val matches = RangeQuery.SpatialRangeQuery(objectRDD, queryEnvelope, false, true)
  matches.count()
}
/*---------------------------- End an example Spatial Range Query with Index ----------------------------*/

import org.datasyslab.geospark.spatialOperator.RangeQuery
import org.datasyslab.geospark.spatialRDD.PointRDD
import com.vividsolutions.jts.geom.Envelope
import org.datasyslab.geospark.enums.FileDataSplitter
import org.datasyslab.geospark.enums.IndexType
queryEnvelope: com.vividsolutions.jts.geom.Envelope = Env[-113.79 : -109.73, 32.99 : 35.08]
objectRDD: org.datasyslab.geospark.spatialRDD.PointRDD = org.datasyslab.geospark.spatialRDD.PointRDD@1e5f5126
resultSize: Long = 445
