# Intro to GIS with PySpark

- https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html
- https://sedona.apache.org/latest/sedonaspark

## Instructions

1. Install Python and the packages listed in `requirements.txt`.
2. Install Java 21 (I'm using OpenJDK; the JRE alone should be enough).

In [1]:
from pyspark.sql import SparkSession
from sedona.spark import SedonaContext
from pathlib import Path
import geopandas as gpd

In [None]:
%%capture

# pyspark can't read .gpkg natively
# Apache Sedona adds geospatial readers to Spark
# TODO: update jar to latest versions
sedona = (
    SedonaContext.builder()
    .master("local[*]")
    .appName("gis_intro")
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-shaded-3.5_2.13:1.8.1,"
        "org.datasyslab:geotools-wrapper:1.8.1-33.1,"
        "org.json4s:json4s-jackson_2.13:3.7.0-M11",
    )
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config(
        "spark.sql.extensions", "org.apache.sedona.spark.CatalystSpark3ShimExtension"
    )
    .getOrCreate()
)

In [3]:
# the reader's load() below is handed a plain string, so assemble the relative
# location with pathlib and convert the resulting Path object to str
gadm_path = str(Path("..") / "data" / "gadm_aus.gpkg")

In [4]:
# List every layer stored in the GeoPackage; these layer names are the values
# passed to the Sedona reader's "tableName" option in the cells below.
gpd.list_layers(gadm_path)

Unnamed: 0,name,geometry_type
0,ADM_ADM_0,MultiPolygon
1,ADM_ADM_1,MultiPolygon
2,ADM_ADM_2,MultiPolygon


In [5]:
# Read the ADM_ADM_0 layer (country-level boundary) into a Spark DataFrame
# through Sedona's "geopackage" data source.
gpkg_raw = (
    sedona.read
    .format("geopackage")
    .option("tableName", "ADM_ADM_0")
    .load(gadm_path)
)

In [6]:
# print at most 5 rows of the layer (rows, not layers)
gpkg_raw.show(5)

                                                                                

+---+--------------------+-----+---------+
|fid|                geom|GID_0|  COUNTRY|
+---+--------------------+-----+---------+
|  1|MULTIPOLYGON (((1...|  AUS|Australia|
+---+--------------------+-----+---------+



In [7]:
# Drop the exploratory name; the same ADM_ADM_0 layer is read again in the next
# cell under the more descriptive name gpkg_country.
del gpkg_raw

In [8]:
def _read_gadm_layer(table_name):
    """Load one layer of the GADM GeoPackage as a Spark DataFrame.

    Args:
        table_name: Name of the layer inside the file at ``gadm_path``,
            passed to the reader as its "tableName" option.

    Returns:
        The DataFrame produced by Sedona's "geopackage" reader for that layer.
    """
    return (
        sedona.read.format("geopackage")
        .option("tableName", table_name)
        .load(gadm_path)
    )


# ADM_ADM_0: country level
gpkg_country = _read_gadm_layer("ADM_ADM_0")

# ADM_ADM_1: states/territories
gpkg_states = _read_gadm_layer("ADM_ADM_1")

# ADM_ADM_2: local government areas
gpkg_lgas = _read_gadm_layer("ADM_ADM_2")

In [9]:
# confirm the reader returned a PySpark DataFrame (the cell's output is its class)
type(gpkg_country)

pyspark.sql.classic.dataframe.DataFrame

In [10]:
# Keep truncate=True (the default, spelled out so the intent is visible): long
# cell values such as the geometry text are clipped in the printed table.
# Author's note: printing these geometries untruncated risked an out-of-memory
# error.
gpkg_country.show(5, truncate=True)

                                                                                

+---+--------------------+-----+---------+
|fid|                geom|GID_0|  COUNTRY|
+---+--------------------+-----+---------+
|  1|MULTIPOLYGON (((1...|  AUS|Australia|
+---+--------------------+-----+---------+



In [11]:
# preview at most 5 rows of the states/territories layer
gpkg_states.show(n=5)

+---+--------------------+-------+-----+---------+--------------------+---------+---------+---------+---------+----+------+------+
|fid|                geom|  GID_1|GID_0|  COUNTRY|              NAME_1|VARNAME_1|NL_NAME_1|   TYPE_1|ENGTYPE_1|CC_1|HASC_1| ISO_1|
+---+--------------------+-------+-----+---------+--------------------+---------+---------+---------+---------+----+------+------+
|  1|MULTIPOLYGON (((1...|AUS.1_1|  AUS|Australia|Ashmore and Carti...|       NA|       NA|Territory|Territory|  12| AU.AS|    NA|
|  2|MULTIPOLYGON (((1...|AUS.2_1|  AUS|Australia|Australian Capita...|       NA|       NA|Territory|Territory|   8| AU.AC|AU-ACT|
|  3|MULTIPOLYGON (((1...|AUS.3_1|  AUS|Australia|Coral Sea Islands...|       NA|       NA|Territory|Territory|  11| AU.CR|    NA|
|  4|MULTIPOLYGON (((1...|AUS.4_1|  AUS|Australia|Jervis Bay Territory|       NA|       NA|Territory|Territory|  10| AU.JB|    NA|
|  5|MULTIPOLYGON (((1...|AUS.5_1|  AUS|Australia|     New South Wales|       NA|  

In [12]:
# preview at most 5 rows of the local government areas layer
gpkg_lgas.show(n=5)

+---+--------------------+---------+-----+---------+-------+--------------------+---------+--------------------+---------+---------+-------------------+-------------------+-----+--------+
|fid|                geom|    GID_2|GID_0|  COUNTRY|  GID_1|              NAME_1|NL_NAME_1|              NAME_2|VARNAME_2|NL_NAME_2|             TYPE_2|          ENGTYPE_2| CC_2|  HASC_2|
+---+--------------------+---------+-----+---------+-------+--------------------+---------+--------------------+---------+---------+-------------------+-------------------+-----+--------+
|  1|MULTIPOLYGON (((1...|AUS.1.1_1|  AUS|Australia|AUS.1_1|Ashmore and Carti...|       NA|Ashmore and Carti...|       NA|       NA|          Territory|          Territory|   NA|      NA|
|  2|MULTIPOLYGON (((1...|AUS.2.1_1|  AUS|Australia|AUS.2_1|Australian Capita...|       NA|Unincorporated Au...|       NA|       NA|Unincorporated Area|Unincorporated Area|89399|      NA|
|  3|MULTIPOLYGON (((1...|AUS.3.1_1|  AUS|Australia|AUS.3_1|