### Import data to Sedona

In [None]:
import xarray as xr
import os

# Directory holding the processed NetCDF inputs. Set the SEDONA_DATA_DIR
# environment variable to point the notebook at another machine's copy; when it
# is unset the original location is used, so existing runs are unaffected.
data_dir = os.environ.get(
    "SEDONA_DATA_DIR",
    "/home/uribe055/sedona_experiments/processed_data",
)

In [3]:
from sedona.spark import SedonaContext
from pyspark.sql.functions import expr

# Bring up the Spark session used by the rest of the notebook. The Sedona and
# geotools-wrapper jars, plus the edu.ucar cdm-core artifact resolved from the
# Unidata repository, are requested through spark.jars.packages.
# Note: despite its name, `config` holds whatever getOrCreate() returns.
config = (
    SedonaContext.builder()
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all/')
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-3.5_2.12:1.7.1,'
        'org.datasyslab:geotools-wrapper:1.7.1-28.5,'
        'edu.ucar:cdm-core:5.4.2',
    )
    .config('spark.driver.memory', '10g')
    .config('spark.driver.maxResultSize', '5g')
    .config('spark.network.timeout', '1000s')
    .getOrCreate()
)

# `sedona` is the handle later cells use for reads and SQL.
sedona = SedonaContext.create(config)


25/05/25 13:21:09 WARN Utils: Your hostname, cs-spatial-501 resolves to a loopback address: 127.0.0.1; using 128.101.33.153 instead (on interface eno1)
25/05/25 13:21:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
https://artifacts.unidata.ucar.edu/repository/unidata-all/ added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/uribe055/.ivy2/cache
The jars for the packages stored in: /home/uribe055/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
edu.ucar#cdm-core added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1557d79c-d64e-4544-b92b-94f71caf9a7d;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/uribe055/sedona_experiments/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-3.5_2.12;1.7.1 in central
	found org.apache.sedona#sedona-common;1.7.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.20.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.error

In [None]:
# Read the NetCDF file as a single binary record, decode it into a raster
# column, then derive a `pixels` column from that raster.
# NOTE(review): confirm RS_Pixels is registered in the Sedona version loaded
# above; verify against the Sedona raster function reference.
df = (
    sedona.read.format("binaryFile")
    .load(
        os.path.join(data_dir, "2m_temperature_GL_2020-2024.nc")
    )
    .withColumn(
        "raster",
        expr("RS_FromNetCDF(content, '2m_temperature', 'longitude', 'latitude')"),
    )
    .withColumn(
        "pixels",
        expr("RS_Pixels(raster)"),
    )
)

In [None]:
from sedona.spark import SedonaContext
from pyspark.sql.functions import expr

# Build the tabular view that the SQL cells query.
#
# The previous version of this cell passed `df` -- the Spark binaryFile
# DataFrame from the cell above -- to createDataFrame(), which expects local
# data (e.g. a pandas DataFrame) and rejects an existing Spark DataFrame. That
# frame also has no latitude/longitude/time columns for the steps below. The
# pandas conversion it relied on was commented out and `ds` was never opened.
#
# Relies on names defined in earlier cells: `xr`, `os`, `data_dir`, `expr`,
# and the `sedona` session (reused here instead of being rebuilt with an
# identical builder call).
#
# NOTE(review): this materialises the whole variable on the driver. If it does
# not fit in spark.driver.memory, load the data in smaller (e.g. daily) slices
# and union the resulting Spark DataFrames instead.
# NOTE(review): assumes the variable's dimensions are named time, latitude and
# longitude so they become columns after reset_index() -- confirm against the file.
with xr.open_dataset(os.path.join(data_dir, "2m_temperature_GL_2020-2024.nc")) as ds:
    temperature_pdf = ds['2m_temperature'].to_dataframe().reset_index()

# Convert the pandas DataFrame to a Spark DataFrame and add a point geometry
# column for spatial queries.
sdf = (
    sedona.createDataFrame(temperature_pdf)
    .withColumn("geom", expr("ST_Point(cast(longitude as double), cast(latitude as double))"))
)

# Register as temp view so it can be queried with sedona.sql(...)
sdf.createOrReplaceTempView("temp_view")

### Get Variable Query

In [None]:
import pyspark.sql.functions as f

# Inputs
# Bounds applied to the latitude / longitude columns of the queried view.
min_lat = 70
max_lat = 80
min_lon = -60
max_lon = -50
# Time window, written as timestamp strings.
start_time = "2022-01-01 00:00:00"
end_time = "2023-01-02 00:00:00"
# Name of the SQL aggregate function to apply to the variable.
agg = "mean"
# SQL expression used as the temporal grouping key. This has to be plain SQL
# text: formatting a pyspark Column (f.day('time')) into an f-string yields the
# Column's repr (e.g. "Column<'day(time)'>"), which is not valid SQL.
# NOTE(review): day() is the day of the month, so rows from different months
# share a bucket; use "date_trunc('day', time)" if calendar days are intended.
temp_agg = "day(time)"


Query

In [None]:
# Aggregate `2m_temperature` per temporal bucket and grid cell inside the
# requested bounding box and time window.
# Fixes relative to the earlier text:
#   * "GROUPBY" is not a SQL keyword; the clause is "GROUP BY".
#   * The SELECT list exposed raw `time`, which is not a grouping expression;
#     it now selects the same {temp_agg} expression that is grouped on.
#   * start_time / end_time and agg were defined as inputs but never applied.
# The values interpolated here are notebook-local constants; do not feed
# untrusted text through this f-string.
query = f"""
    SELECT {temp_agg} AS time_bucket, latitude, longitude, {agg}(`2m_temperature`)
    FROM temp_view
    WHERE latitude BETWEEN {min_lat} AND {max_lat}
        AND longitude BETWEEN {min_lon} AND {max_lon}
        AND time BETWEEN TIMESTAMP '{start_time}' AND TIMESTAMP '{end_time}'
    GROUP BY {temp_agg}, latitude, longitude
    """

Result

In [None]:
# Submit the SQL text built in the previous cell to the Sedona session.
result = sedona.sql(query)
# Print the leading rows of the result in the cell output.
result.show()

In [None]:
# Shut down the session held by `sedona`; cells above must be re-run to query again.
sedona.stop()