In [None]:
from sedona.spark import *
from pyspark.sql.functions import col, count, countDistinct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
import sys, os
from sedona.spark.sql.st_constructors import ST_Point
from sedona.spark.sql.st_functions import GeometryType
from sedona.spark import SedonaKepler
from pyspark.sql import functions as F
from itertools import product
from sedona.spark.geopandas import GeoDataFrame, read_parquet
from sedona.spark import SedonaContext




In [None]:
# Build the Spark session and register Sedona's spatial types/functions on it.
#
# The Overture Maps data read later in this notebook lives in the public
# "overturemaps-us-west-2" bucket, so the S3A connector is told to use
# anonymous credentials for that specific bucket. The per-bucket option name
# must contain the real bucket name; the documentation placeholder
# "bucket-name" would leave the setting without effect for the Overture reads.
sd = (
    SedonaContext.builder()
    .config(
        "spark.hadoop.fs.s3a.bucket.overturemaps-us-west-2.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    )
    .getOrCreate()
)

# `sd` is the session used for reads below; `sedona` is the Sedona-enabled
# handle returned by SedonaContext.create and used for createDataFrame.
sedona = SedonaContext.create(sd)

In [None]:

# Directory holding the CSV files to ingest.
directory_path = "../2024/"

# Load every CSV file found under the directory into a single DataFrame.
# The files are expected to start with a header row.
# NOTE(review): no schema (and no inferSchema option) is supplied, so column
# types are whatever the CSV reader defaults to. An explicit schema along
# these lines could be passed with .schema(custom_schema) if typed columns
# are needed:
#   custom_schema = StructType([
#       StructField("STATION", StringType(), True),
#       StructField("DATE", DateType(), True),
#       StructField("LATITUDE", FloatType(), True),
#       StructField("LONGITUDE", FloatType(), True),
#       StructField("HourlyPressureChange", FloatType(), True),
#   ])
df = (
    sd.read
    .format("csv")
    .option("header", True)
    .load(directory_path)
)

# Quick sanity check of the first two rows (optional).
df.show(2)

# Hint: if the CSV files carried WKT geometries (say in a "geom_wkt" column),
# they could be turned into a Sedona geometry column like this:
#   from sedona.sql.functions import ST_GeomFromText
#   df = df.withColumn("geometry", ST_GeomFromText(col("geom_wkt"))).drop("geom_wkt")

In [None]:
# Keep the station/observation columns of interest, add a point geometry built
# from the station coordinates (longitude first, then latitude), and retain
# only the rows that carry an HourlyPressureChange value.
df_p_diff = (
    df.select(
        ST_Point(F.col("LONGITUDE"), F.col("LATITUDE")).alias("GEOMETRY"),
        "STATION",
        "DATE",
        "LATITUDE",
        "LONGITUDE",
        "ELEVATION",
        "NAME",
        "REPORT_TYPE",
        "SOURCE",
        "HourlyDryBulbTemperature",
        "HourlyPressureChange",
        "HourlyPressureTendency",
        "HourlySeaLevelPressure",
        "HourlyStationPressure",
        "HourlyWindDirection",
        "HourlyWindGustSpeed",
        "HourlyWindSpeed",
    )
    .where(F.col("HourlyPressureChange").isNotNull())
)

# Number of distinct stations that report a pressure change ...
# (per-station variant: df_p_diff.groupBy("STATION").agg(countDistinct("DATE").alias("HPC_count")))
df_p_diff.select(F.countDistinct("STATION")).show()

# ... compared with the number of distinct stations in the full data set.
# (per-station variant: df.groupBy("STATION").agg(countDistinct("DATE").alias("FULL_count")))
df.select(F.countDistinct("STATION")).show()

In [None]:
from sedona.spark import KNNQuery
from shapely.geometry import Point

# Convert the filtered observations into a SpatialRDD keyed on the GEOMETRY
# column so the RDD-based KNN query can be run against it.
spatialRDD = StructuredAdapter.toSpatialRdd(df_p_diff, "GEOMETRY")

k = 100  # number of nearest neighbours to return
using_index = False  # no index has been built yet at this point

# The query point is constructed directly instead of being read from
# `spatial_df`: that DataFrame is only created in a later cell, so referring
# to it here fails with a NameError on a fresh kernel (Restart & Run All).
# (-180.0, -90.0) is the first coordinate pair of the grid that cell generates.
query_point = Point(-180.0, -90.0)

result = KNNQuery.SpatialKnnQuery(spatialRDD, query_point, k, using_index)
print(result)

In [None]:
# NOTE(review): presumably gathers the dataset statistics (extent / record
# count) that the spatial partitioning step relies on — confirm against the
# Sedona SpatialRDD documentation.
spatialRDD.analyze()

In [None]:
%%time
# Spatially partition the RDD using the KDB-tree grid type; 5000 is passed as
# the second argument (presumably the requested partition count — confirm).
spatialRDD.spatialPartitioning(GridType.KDBTREE, 5000)
# Build an R-tree index with the second argument set to False.
# NOTE(review): presumably False means "index the raw RDD rather than the
# spatially partitioned one" — confirm against the Sedona docs.
spatialRDD.buildIndex(IndexType.RTREE, False)

In [None]:
# Build an R-tree index again, this time with the second argument set to True.
# NOTE(review): presumably this targets the spatially partitioned RDD — confirm.
spatialRDD.buildIndex(IndexType.RTREE, True)

In [None]:
# Display the number of partitions of the SpatialRDD's underlying raw RDD.
spatialRDD.rawSpatialRDD.getNumPartitions()

In [None]:
%%time
k = 100
using_index = True
result = KNNQuery.SpatialKnnQuery(spatialRDD, spatial_df.first()['geometry'], k, using_index)
print(result)

In [None]:
%%time
# Times an R-tree index build with the second argument set to False; the same
# call is also issued right after spatialPartitioning in an earlier cell.
spatialRDD.buildIndex(IndexType.RTREE, False)

In [None]:
# Scratch cell: exploratory checks on the hourly pressure columns.

# Observations that carry a pressure-change value, together with a point built
# from the station coordinates. show() returns None, so the DataFrame is
# assigned first and displayed in a separate statement.
df_pressure_diff = df.select(
    ST_Point(col("LONGITUDE"), col("LATITUDE")),
    "STATION", "DATE", "LATITUDE", "LONGITUDE", "ELEVATION", "NAME",
    "REPORT_TYPE", "SOURCE", "HourlyDryBulbTemperature", "HourlyPressureChange",
    "HourlyPressureTendency", "HourlySeaLevelPressure", "HourlyStationPressure",
    "HourlyWindDirection", "HourlyWindGustSpeed", "HourlyWindSpeed",
).filter(col("HourlyPressureChange").isNotNull())
df_pressure_diff.show()

# Record counts under different filters. They are printed explicitly because
# only the last expression of a cell is displayed automatically.
# Counts noted during an earlier run: 130,112,717 37,352,572 77,319,947 77,210,243
print(
    df.filter(
        col("HourlySeaLevelPressure").isNotNull() | col("HourlyStationPressure").isNotNull()
    ).count()
)
print(df.count())

# Absolute pressure change as a new column. Python's builtin abs() cannot be
# applied to a column name and Spark DataFrames do not support pandas-style
# item assignment, so the column is derived with F.abs / withColumn. The value
# is cast to float first so the arithmetic is numeric. The result is kept
# under its own name so the raw `df` is not overwritten.
pressure_change = F.col("HourlyPressureChange").cast(FloatType())
df_abs_pressure = df.withColumn("absPressure", F.abs(pressure_change))

# Small sample of the derived column for inspection (show() avoids pulling
# rows from the end of the data set onto the driver as tail() would).
df_abs_pressure.select("STATION", "DATE", "HourlyPressureChange", "absPressure").show(10)

# Overall minimum pressure change (numeric comparison).
df_abs_pressure.agg(F.min(pressure_change)).show()

# Smallest absolute pressure change per station and date.
aggregated = (
    df_abs_pressure
    .filter(col("HourlyPressureChange").isNotNull())
    .groupBy("STATION", "DATE")
    .agg(F.min("absPressure").alias("min_abs_pressure_change"))
)
aggregated

In [None]:
# Using Overture Maps instead of the Natural Earth download below
#import geopandas as gpd

#url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"


#gdf = gpd.read_file(url)
#gdf
#df_conus = sedona.createDataFrame(gdf[(gdf.SOV_A3=='US1') & (gdf.TYPE=='Country')][['SOVEREIGNT', 'geometry']])
#map = SedonaKepler.create_map(df=df_conus, name="CONUS")
#map


In [None]:
# 2. Grid resolution: 0.5 degrees in both directions
longitude_step = 0.5
latitude_step = 0.5

# Integer multiples of the step covering -180..180 and -90..90, end points included
longitudes = [
    longitude_step * n
    for n in range(int(-180 / longitude_step), int(180 / longitude_step) + 1)
]
latitudes = [
    latitude_step * n
    for n in range(int(-90 / latitude_step), int(90 / latitude_step) + 1)
]

# 3. Every (longitude, latitude) combination, longitude varying slowest
coordinate_pairs = [(lon, lat) for lon in longitudes for lat in latitudes]

# 4. Turn the coordinate list into a Spark DataFrame
schema = ["longitude", "latitude"]
df_lat_lon = sedona.createDataFrame(coordinate_pairs, schema)

# 5. Attach a Sedona point geometry to each row
# (ST_Point expects longitude first, then latitude)
point_expression = F.expr("ST_Point(longitude, latitude)")
spatial_df = df_lat_lon.withColumn("geometry", point_expression)

# Preview the result and its schema
print("Generated spatial DataFrame:")
spatial_df.show(5)
spatial_df.printSchema()

In [None]:
# Inspect the geometry value stored in the first row of the grid DataFrame.
spatial_df.first()['geometry']

In [None]:
# Read the Overture Maps division-area files as GeoParquet.
# - A SparkSession has no `read_parquet` method; the DataFrameReader is used
#   instead, with Sedona's "geoparquet" format as in the source_df cell.
# - The path uses the s3a:// scheme, which is the connector the session's
#   anonymous-credentials setting is configured for.
df_boundaries = (
    sd.read.format("geoparquet")
    .load("s3a://overturemaps-us-west-2/release/2025-09-24.0/theme=divisions/type=division_area/*.parquet")
)

# Preview the boundaries that were just loaded (not the CSV DataFrame `df`).
df_boundaries.show(3)

In [None]:
# Overture Maps release to read from.
OVERTURE_RELEASE = "2025-09-24.0"

# Country codes to keep when filtering the divisions.
COUNTRY_CODES_OF_INTEREST = ["US"]

# Location of the division_area data for the chosen release.
SOURCE_DATA_URL = (
    "s3a://overturemaps-us-west-2"
    f"/release/{OVERTURE_RELEASE}"
    "/theme=divisions/type=division_area"
)

# Name of the output parquet file.
OUTPUT_FILE = "my_super_cool_data.parquet"

In [None]:
# Condition that holds when the "country" array column shares at least one
# element with the (upper-cased) country codes of interest.
country_overlap_condition = F.arrays_overlap(
    F.col("country"),
    F.array(*(F.lit(code.upper()) for code in COUNTRY_CODES_OF_INTEREST)),
)

In [None]:
# Load the division areas for the configured release, keep only country-level
# rows for the countries of interest, and stamp each row with the release
# version and the ingest time.
source_df = (
    sd.read.format("geoparquet")
    .load(SOURCE_DATA_URL)
    .where(F.col("country").isin(COUNTRY_CODES_OF_INTEREST))
    # optional narrowing to a single region:
    # .where(F.col("region") == "US-CA")
    .where(F.col("subtype") == "country")
    .withColumn("_overture_release_version", F.lit(OVERTURE_RELEASE))
    .withColumn("_ingest_timestamp", F.current_timestamp())
)

In [None]:
# Keep only the geometry and country code of the selected divisions.
USA_geom = source_df.selectExpr("geometry", "country")
USA_geom.show(5)

# Render the geometries with Kepler. The map is bound to `usa_map` rather than
# `map` so that Python's builtin map() is not shadowed for the rest of the
# kernel session.
usa_map = SedonaKepler.create_map(USA_geom, name="USA")
usa_map

In [None]:
#Used for testing with just CA
#CA_geom = source_df.selectExpr("geometry", "region")
#CA_geom.show(5)
#map1 = SedonaKepler.create_map(CA_geom, name="CA")
#map1
#
#CA_geom = source_df.selectExpr("geometry", "region").filter(GeometryType(col('geometry'))=='MULTIPOLYGON')
#CA_geom.show(5)
#map2 = SedonaKepler.create_map(CA_geom, name="CA")
#map2
#
#CA_geom = source_df.selectExpr("geometry", "region").filter(GeometryType(col('geometry'))=='POLYGON')
#CA_geom.show(5)
#map3 = SedonaKepler.create_map(CA_geom, name="CA")
#map3