In [None]:
def get_con(database="duckdb/san_fran_ept_lpc.ddb"):
    """Open a DuckDB connection configured for the EPT / H3 workflow.

    The connection gets a temp directory and S3 region, and the ``h3``,
    ``httpfs``, ``spatial`` and ``pdal`` extensions are installed and loaded
    (``h3`` and ``pdal`` come from the community repository). The progress bar
    is switched on as the last setup step.

    Args:
        database: Path handed to ``duckdb.connect``. Defaults to the notebook's
            database file, so existing ``get_con()`` calls behave as before.

    Returns:
        The configured DuckDB connection. The caller is responsible for
        closing it.

    Raises:
        Whatever ``duckdb`` raises while connecting or running the setup
        statements. If setup fails, the connection is closed before the
        exception propagates so it is not leaked.
    """
    import duckdb

    con = duckdb.connect(database)
    try:
        con.sql("""
        SET temp_directory = 'notebooks/tmp/';
        SET s3_region = 'us-west-2';
        INSTALL h3 FROM community;
        INSTALL httpfs;
        INSTALL spatial;
        INSTALL pdal FROM community;
        LOAD h3;
        LOAD httpfs;
        LOAD spatial;
        LOAD pdal;
        SET enable_progress_bar = true;
    """)
    except Exception:
        # Setup failed (e.g. an extension could not be installed): release the
        # connection instead of leaving it open, then re-raise unchanged.
        con.close()
        raise
    return con

In [2]:
# Open the notebook-level connection first: no earlier cell assigns `con`, so on
# a fresh kernel the query below would otherwise fail with NameError.
con = get_con()
# Show the EPT metadata reported by PDAL_Info for the dataset.
con.sql("SELECT * FROM PDAL_Info('https://s3-us-west-2.amazonaws.com/usgs-lidar-public/CA_SanFrancisco_1_B23/ept.json')")

┌─────────────────────────────────────────────────────────────────────────────────────┬─────────────┬─────────────┬───────────┬────────┬─────────────┬───────────┬────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┬────────────┬────────────────┬────────────────┬─────────────────┬──────────────────────────────────────┬───────────────┬───────────────

In [None]:
# # Config
# ept_url = "https://s3-us-west-2.amazonaws.com/usgs-lidar-public/CA_SanFrancisco_1_B23/ept.json"
# src_crs = 'EPSG:3857'
# dst_crs = 'EPSG:4326'
# res = 11

# # Full dataset extent from PDAL_Info metadata
# bbox_min_x, bbox_min_y = -13638426, 4536715
# bbox_max_x, bbox_max_y = -13617318, 4556481

# # Pass bounds to readers.ept via PDAL_Read options
# bounds = f"([{bbox_min_x},{bbox_max_x}],[{bbox_min_y},{bbox_max_y}])"

# lat= f"ST_Y(ST_Transform(ST_Point(X, Y), '{src_crs}', '{dst_crs}', always_xy := true))"
# lng =f"ST_X(ST_Transform(ST_Point(X, Y), '{src_crs}', '{dst_crs}', always_xy := true))"
# query = f"""

# CREATE OR REPLACE TABLE san_fran_res_11 as
# SELECT 
#     h3_latlng_to_cell({lat}, {lng}, {res}) AS h3_index,
#     AVG(Z) AS avg_elevation,
#     MIN(Z) AS min_z,
#     MAX(Z) AS max_z,
#     MAX(Z) - MIN(Z) AS z_range,
#     COUNT(1) AS cnt
# FROM PDAL_Read('{ept_url}', options => MAP {{'bounds': '{bounds}'}})
# GROUP BY 1;
# """

# print(query)
# con.sql(query)



CREATE OR REPLACE TABLE san_fran_res_11 as
SELECT 
    h3_latlng_to_cell(ST_Y(ST_Transform(ST_Point(X, Y), 'EPSG:3857', 'EPSG:4326', always_xy := true)), ST_X(ST_Transform(ST_Point(X, Y), 'EPSG:3857', 'EPSG:4326', always_xy := true)), 11) AS h3_index,
    AVG(Z) AS avg_elevation,
    MIN(Z) AS min_z,
    MAX(Z) AS max_z,
    MAX(Z) - MIN(Z) AS z_range,
    COUNT(1) AS cnt
FROM PDAL_Read('https://s3-us-west-2.amazonaws.com/usgs-lidar-public/CA_SanFrancisco_1_B23/ept.json', options => MAP {'bounds': '([-13638426,-13617318],[4536715,4556481])'})
GROUP BY 1;



: 

# CLAUDE: please make the following run concurrently using concurrent.futures:

In [None]:
import concurrent.futures
import mercantile
import pyarrow as pa
import time

# Config
# EPT dataset endpoint handed to PDAL_Read for every per-tile query.
ept_url = "https://s3-us-west-2.amazonaws.com/usgs-lidar-public/CA_SanFrancisco_1_B23/ept.json"
# Source / destination CRS interpolated into the ST_Transform expressions below.
src_crs = 'EPSG:3857'
dst_crs = 'EPSG:4326'
# H3 resolution passed to h3_latlng_to_cell.
res = 11

# Full dataset extent from PDAL_Info metadata
# (these values are fed to mercantile.lnglat below, i.e. treated as Web Mercator x/y).
bbox_min_x, bbox_min_y = -13638426, 4536715
bbox_max_x, bbox_max_y = -13617318, 4556481

# SQL fragments for a point's latitude / longitude after transforming (X, Y)
# from src_crs to dst_crs; they become the arguments of h3_latlng_to_cell.
lat = f"ST_Y(ST_Transform(ST_Point(X, Y), '{src_crs}', '{dst_crs}', always_xy := true))"
lng = f"ST_X(ST_Transform(ST_Point(X, Y), '{src_crs}', '{dst_crs}', always_xy := true))"

# Tile the bbox with mercantile: convert both corners to lng/lat, then list
# every tile at `zoom` that covers the extent.
# NOTE(review): the original note said tiling "aligns with how EPT octree is
# organized"; that cannot be verified from this notebook — confirm before relying on it.
sw = mercantile.lnglat(bbox_min_x, bbox_min_y)
ne = mercantile.lnglat(bbox_max_x, bbox_max_y)
zoom = 18

tiles = list(mercantile.tiles(sw.lng, sw.lat, ne.lng, ne.lat, zooms=zoom))
print(f"{len(tiles)} tiles at z{zoom}")


def process_tile(tile):
    """Aggregate the points of one tile into H3 cells on a dedicated connection.

    The tile's bounds (from ``mercantile.xy_bounds``) are passed to
    ``PDAL_Read`` as the ``bounds`` option, and the points read are grouped by
    H3 cell at resolution ``res``.

    Args:
        tile: A mercantile tile, as produced by ``mercantile.tiles``.

    Returns:
        A ``(tile, table)`` tuple where ``table`` is the Arrow table with
        columns ``hex``, ``avg_elevation``, ``min_z``, ``max_z``, ``z_range``
        and ``cnt``.

    Raises:
        Whatever the DuckDB query raises. The connection is closed in every
        case, so a failing tile does not leak a connection.
    """
    tb = mercantile.xy_bounds(tile)
    tile_bounds = f"([{tb.left},{tb.right}],[{tb.bottom},{tb.top}])"
    con = get_con()
    try:
        result = con.sql(f"""
        SELECT
            h3_latlng_to_cell({lat}, {lng}, {res}) AS hex,
            AVG(Z) AS avg_elevation,
            MIN(Z) AS min_z,
            MAX(Z) AS max_z,
            MAX(Z) - MIN(Z) AS z_range,
            COUNT(1) AS cnt
        FROM PDAL_Read('{ept_url}', options => MAP {{'bounds': '{tile_bounds}'}})
        GROUP BY 1
    """).fetch_arrow_table()
    finally:
        # Always release the per-tile connection, including when the read or
        # the aggregation fails.
        con.close()
    return tile, result


max_workers = 10  # tune based on memory / CPU
start = time.time()
results = []
# (tile, exception) pairs for tiles whose query raised, kept so that an
# incomplete run is visible after the loop instead of only in scrolled output.
failed_tiles = []

# Submit one task per tile and consume them in completion order.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_tile, t): t for t in tiles}
    for future in concurrent.futures.as_completed(futures):
        tile = futures[future]
        try:
            _, tbl = future.result()
        except Exception as e:
            # Best effort: one bad tile must not abort the run, but remember it.
            failed_tiles.append((tile, e))
            print(f"  z{zoom}/{tile.x}/{tile.y} FAILED: {e}")
        else:
            results.append(tbl)
            print(f"  z{zoom}/{tile.x}/{tile.y} — {tbl.num_rows} hex ({time.time()-start:.0f}s)")

if failed_tiles:
    print(f"WARNING: {len(failed_tiles)} of {len(tiles)} tiles failed; the combined result will be incomplete")

# Combine the per-tile tables and re-aggregate hexes that span tile boundaries.
if not results:
    # pa.concat_tables rejects an empty list; fail with a message that says why.
    raise ValueError("No tile produced a result; nothing to write to san_fran_res_11")

combined = pa.concat_tables(results)
con = get_con()
# Aggregate straight from the in-memory Arrow table (`combined` is resolved by
# name from this cell) rather than writing the table and then replacing it from
# itself. avg_elevation is re-weighted by each tile's point count.
con.sql("""
    CREATE OR REPLACE TABLE san_fran_res_11 AS
    SELECT hex,
           SUM(avg_elevation * cnt) / SUM(cnt) AS avg_elevation,
           MIN(min_z) AS min_z, MAX(max_z) AS max_z,
           MAX(max_z) - MIN(min_z) AS z_range,
           SUM(cnt) AS cnt
    FROM combined
    GROUP BY 1
""")

# `start` was set before the tiles were submitted, so this covers the full run.
elapsed = time.time() - start
df = con.sql("FROM san_fran_res_11").df()
print(f"\n{len(df)} hex, {df['cnt'].sum():,} points, {elapsed:.1f}s")

In [None]:
# Only a cell's last expression is displayed, so print the table preview
# explicitly; the schema below is then shown as the cell's output.
con.table('san_fran_res_11').show()
con.sql("describe san_fran_res_11")

In [4]:
# # Quick test - just read 10 raw points with bounds, no transform, no agg
# con.sql(f"""
# SELECT X, Y, Z, Classification, Intensity 
# FROM PDAL_Read('{ept_url}', options => MAP {{'bounds': '{bounds}'}})
# LIMIT 10
# """)

In [None]:
# Run `query` on the notebook connection and fetch the result as an Arrow table;
# `start` marks the beginning of the timing that the next cell reports.
# NOTE(review): `query` is not assigned in any active cell visible here — the only
# assignment shown is inside a commented-out cell — so define it before running this.
# NOTE(review): the earlier comment mentioned a `max_points` safety bail; no such
# variable is visible in this notebook — confirm whether it still applies.
import time

start = time.time()
result = con.sql(query).fetch_arrow_table()


In [None]:
# Convert the Arrow result to pandas and report row count, total points and timing.
# NOTE(review): `start` is set in the previous cell, so `elapsed` also includes the
# to_pandas() conversion and any idle time between running the two cells.
df = result.to_pandas()
elapsed = time.time() - start

print(f"{len(df)} hexagons, {df['cnt'].sum():,} points, {elapsed:.1f}s")
# Last expression of the cell: displays the first 20 rows.
df.head(20)

33 hexagons, 234,480,604 points, 415.6s


Unnamed: 0,h3_index,avg_elevation,min_z,max_z,z_range,cnt
0,617700169965895679,19.16073,3.42,41.07,37.65,10058968
1,617700170047160319,28.007489,14.05,70.23,56.18,10344363
2,617700170031955967,21.403428,8.68,47.18,38.5,3521386
3,617700170046636031,60.936497,19.08,109.21,90.13,5615879
4,617700169966419967,12.33877,-0.54,34.93,35.47,9982447
5,617700169971400703,14.168135,3.49,40.78,37.29,5630237
6,617700169958817791,21.889214,9.9,36.34,26.44,113372
7,617700170054500351,105.184819,70.58,132.66,62.08,4953889
8,617700170045587455,34.630431,23.09,55.59,32.5,4576644
9,617700170031169535,17.147354,1.49,47.84,46.35,9810494
