### Import data to Sedona

In [None]:
import xarray as xr
import os

# Locate the processed NetCDF file under the experiment directory and open it
# with xarray; `ds` is the dataset handle the following cells read from.
home_dir = "/home/uribe-55/sedona_experiments"
data_dir = os.path.join(home_dir, "processed_data")
dataset_path = os.path.join(data_dir, "2m_temperature_GL_2015-2024.nc")
ds = xr.open_dataset(dataset_path)

In [None]:
from sedona.spark import SedonaContext
from pyspark.sql.functions import expr

# Flatten the 2m_temperature variable into a pandas frame; reset_index() turns
# the index levels produced by to_dataframe() into ordinary columns.
df = ds['2m_temperature'].to_dataframe().reset_index()

# Maven coordinates passed to Spark as a single comma-separated string.
spark_packages = ','.join([
    'org.apache.sedona:sedona-spark-3.5_2.12:1.7.1',
    'org.datasyslab:geotools-wrapper:1.7.1-28.5',
    'edu.ucar:cdm-core:5.4.2',
])

# Build (or reuse) the session with the extra repository, the packages above
# and the driver/network settings, then register Sedona on it.
config = (
    SedonaContext.builder()
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all/')
    .config('spark.jars.packages', spark_packages)
    .config('spark.driver.memory', '10g')
    .config('spark.driver.maxResultSize', '5g')
    .config('spark.network.timeout', '1000s')
    .getOrCreate()
)
sedona = SedonaContext.create(config)

# Hand the pandas frame over to Spark.
sdf = config.createDataFrame(df)

# Add a point geometry per row, built from the longitude/latitude columns,
# so the data can be used in spatial queries.
point_sql = "ST_Point(cast(longitude as double), cast(latitude as double))"
sdf = sdf.withColumn("geom", expr(point_sql))

# Make the frame addressable from SQL under the name `temp_view`.
sdf.createOrReplaceTempView("temp_view")

### Get Variable Query

In [None]:
import pyspark.sql.functions as f

# Inputs
# Bounding box of the region of interest, in the same units as the
# latitude/longitude columns of the data.
min_lat = 70
max_lat = 80
min_lon = -60
max_lon = -50
# Time window of interest, as "YYYY-MM-DD HH:MM:SS" strings.
start_time = "2022-01-01 00:00:00"
end_time = "2023-01-02 00:00:00"
# Name of the SQL aggregate function to apply to the variable.
agg = "mean"
# Temporal grouping key, written as a Spark SQL expression string.
# Interpolating the PySpark Column `f.day('time')` into an f-string yields its
# Python repr (Column<'day(time)'>), which is not valid SQL, so the expression
# text is spelled out directly.
# NOTE(review): day() returns the day of the month; if one bucket per calendar
# day is wanted, date_trunc('day', time) is the likely replacement - confirm.
temp_agg = "day(time)"


Query

In [None]:
# Assemble the SQL text from the input variables:
#   * the temporal key `temp_agg` is both selected and grouped on, because a
#     bare `time` column that is neither aggregated nor grouped is rejected by
#     the analyzer;
#   * the aggregate function comes from `agg` instead of a hard-coded mean;
#   * rows are restricted to the bounding box and to [start_time, end_time];
#   * the grouping clause is spelled `GROUP BY` (two words).
# NOTE(review): assumes `time` is a timestamp column and that `temp_agg` holds
# a SQL expression string - confirm against the input cell.
query = f"""
    SELECT {temp_agg} AS time_bucket, latitude, longitude,
        {agg}(`2m_temperature`) AS agg_value
    FROM temp_view
    WHERE latitude BETWEEN {min_lat} AND {max_lat}
        AND longitude BETWEEN {min_lon} AND {max_lon}
        AND time BETWEEN TIMESTAMP '{start_time}' AND TIMESTAMP '{end_time}'
    GROUP BY {temp_agg}, latitude, longitude
    """

Result

In [None]:
# Run the assembled SQL text on the Sedona session and keep the returned
# DataFrame in `result`.
result = sedona.sql(query)
# Print a preview of the result rows in the cell output.
result.show()

In [None]:
# Shut down the session obtained from SedonaContext.create() now that the
# query results have been displayed.
sedona.stop()