# DuckDB + Parquet Data Exploration Template

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import folium
from pathlib import Path
from tqdm import tqdm
from branca.colormap import linear

# --- Configuration ---

In [None]:
# Gather every data.parquet that sits four directory levels below the
# SkySat results root, then show how many were found.
results_root = Path("/Users/kyledorman/data/planet_coverage/points_30km/skysat/results")
pths = list(results_root.glob("*/*/*/*/data.parquet"))
len(pths)

In [None]:
from shapely import wkb

# Sanity check: print the path of every parquet file that contains an invalid
# geometry, either as loaded (tagged "orig") or after reprojection to
# ``orig_crs`` (tagged "proj").
# NOTE: ``orig_crs`` is assigned in a later cell (the CRS of ocean_grids.gpkg),
# so that cell has to be executed before this one.
# The progress bar uses the ``tqdm`` function imported at the top of the
# notebook; re-importing the ``tqdm`` module here would rebind that name and
# break any later ``tqdm(...)`` call.
for pth in tqdm(pths):
    # load the parquet into pandas to rebuild geometries
    df_pd: pd.DataFrame = pd.read_parquet(pth)
    df_pd["geometry"] = df_pd["geometry_wkb"].apply(wkb.loads)  # type: ignore
    df_pd = df_pd.drop(columns=["geometry_wkb"])
    satellite_gdf = gpd.GeoDataFrame(df_pd, geometry="geometry", crs="EPSG:4326")
    if not satellite_gdf.geometry.is_valid.all():
        print("orig", pth)
    proj_gdf = satellite_gdf.to_crs(orig_crs)
    if not proj_gdf.geometry.is_valid.all():
        print("proj", pth)

In [None]:
# Inspect one results file: repair its geometries, reproject them, and draw
# the rows that were invalid immediately after reprojection.
# NOTE: relies on ``wkb`` (imported in an earlier cell) and ``orig_crs``
# (assigned in a later cell).
pth = '/Users/kyledorman/data/planet_coverage/points_30km/skysat/results/2019/00/40/10/data.parquet'
df_pd: pd.DataFrame = pd.read_parquet(pth)
df_pd["geometry"] = df_pd["geometry_wkb"].apply(wkb.loads)  # type: ignore
satellite_gdf = gpd.GeoDataFrame(df_pd, geometry="geometry", crs="EPSG:4326")

# Repair in EPSG:4326 first, then record which rows are still invalid once
# projected -- the mask is taken before the second repair on purpose.
satellite_gdf.geometry = satellite_gdf.geometry.make_valid()
proj_gdf = satellite_gdf.to_crs(orig_crs)
valid = proj_gdf.geometry.is_valid

proj_gdf.geometry = proj_gdf.geometry.make_valid()

# Centre the map on the mean centroid of the EPSG:4326 geometries.
centroids = satellite_gdf.geometry.centroid
m = folium.Map(
    location=[centroids.y.mean(), centroids.x.mean()],
    zoom_start=4,
    tiles="CartoDB positron",
    width=1000,
    height=600,
)

# Bring the repaired geometries back to the map CRS and add the flagged ones.
repaired_lonlat = proj_gdf.to_crs(satellite_gdf.crs)
for geom in repaired_lonlat[~valid].geometry:
    folium.GeoJson(geom).add_to(m)

m
    

In [None]:
# Directory containing the input files read by the loading cell
# (ca_ocean.geojson, ocean_grids.gpkg, coastal_grids.gpkg,
# coastal_skysat_dove_intersections.gpkg).
BASE = Path("/Users/kyledorman/data/planet_coverage/ca_only/")  # <-- update this

In [None]:
def _dedupe_and_key_pairs(pairs):
    """Return one row per (dove_id, skysat_id) pair, indexed by that pair.

    Rows are sorted by dove_id, skysat_id and cell_id before duplicates are
    dropped, so the row kept for each pair is the one that sorts first by
    cell_id. The result is indexed by a ``pair_key`` tuple
    ``(dove_id, skysat_id)``.
    """
    pair_cols = ["dove_id", "skysat_id"]
    unique_pairs = pairs.sort_values(by=[*pair_cols, "cell_id"]).drop_duplicates(subset=pair_cols)
    unique_pairs["pair_key"] = list(zip(unique_pairs["dove_id"], unique_pairs["skysat_id"]))
    unique_pairs = unique_pairs.set_index("pair_key", drop=True)
    assert not unique_pairs.index.duplicated().any(), "Composite key isn’t unique!"
    return unique_pairs


ca_ocean = gpd.read_file(BASE / "ca_ocean.geojson")

# Read the ocean grid file once: its native CRS is kept as ``orig_crs`` (used
# by other cells for reprojection) and the grid itself is moved to ca_ocean's
# CRS, which every other layer in this cell shares.
ocean_grids = gpd.read_file(BASE / "ocean_grids.gpkg")
orig_crs = ocean_grids.crs
query_df = ocean_grids.to_crs(ca_ocean.crs)
grids_df = gpd.read_file(BASE / "coastal_grids.gpkg").to_crs(ca_ocean.crs)

# Keep the ocean grid cells touching ca_ocean, then the coastal grid cells
# touching those.
query_ca = query_df[query_df.geometry.intersects(ca_ocean.union_all())]

grids_ca = grids_df[grids_df.geometry.intersects(query_ca.union_all())]

# Intersection records, with absolute time offsets in minutes and hours and
# the absolute value of dove_tide_height.
inter_df = gpd.read_file(BASE / "coastal_skysat_dove_intersections.gpkg").to_crs(ca_ocean.crs)
inter_df['acquired_delta_minutes'] = (inter_df.acquired_delta_sec / 60).abs()
inter_df['acquired_delta_hours'] = (inter_df.acquired_delta_sec / 60 / 60).abs()
inter_df['dove_tide_height_abs'] = inter_df.dove_tide_height.abs()

# The under-10-minute subset is taken BEFORE de-duplication, so each table
# picks its representative row independently.
inter_df_10min = inter_df[inter_df.acquired_delta_minutes < 10]

inter_df = _dedupe_and_key_pairs(inter_df)
inter_df_10min = _dedupe_and_key_pairs(inter_df_10min)

inter_df.head(5)

In [None]:
def plot_df(df, column_name, title, zoom=7, show_grids: bool = True):
    """Draw every row of ``df`` on a folium map, coloured by ``column_name``.

    Args:
        df: Frame exposing ``geometry`` and a ``column_name`` column; one map
            feature is added per row, with the row's index label in its tooltip.
        column_name: Column whose value selects each feature's viridis colour
            and is shown, to one decimal place, in the tooltip.
        title: Caption of the colour legend.
        zoom: Initial zoom level of the map.
        show_grids: When true, also outline every row of the module-level
            ``grids_ca`` in blue, using its ``cell_id`` as tooltip.

    Returns:
        The populated ``folium.Map``.
    """
    values = df[column_name]
    highest, lowest = values.max(), values.min()
    # When every value is identical the scale starts at 0 instead of at that value.
    scale_floor = 0 if highest == lowest else lowest
    color_scale = linear.viridis.scale(scale_floor, highest)

    # Centre the map on the mean centroid of the plotted geometries.
    centroids = df.geometry.centroid
    m = folium.Map(
        location=[centroids.y.mean(), centroids.x.mean()],
        zoom_start=zoom,
        tiles="CartoDB positron",
        width=1000,
        height=600,
    )

    def outline_style(feature):
        # Thin blue outline for the reference grid cells.
        return {
            "color": "blue",
            "weight": 1,
        }

    def filled_style(colour):
        # Bind the colour now so each feature keeps its own value.
        def style(feature):
            return {
                "fillColor": colour,
                "color": colour,  # outline same as fill
                "weight": 1,
                "fillOpacity": 0.1,
            }

        return style

    if show_grids:
        for _, grid_row in grids_ca.iterrows():
            folium.GeoJson(
                grid_row.geometry,
                tooltip=str(grid_row["cell_id"]),
                style_function=outline_style,
            ).add_to(m)

    for grid_id, row in df.iterrows():
        value = row[column_name]
        folium.GeoJson(
            data=row.geometry,
            style_function=filled_style(color_scale(value)),
            tooltip=f"{grid_id}<br>{column_name}: {value:0.1f}",
        ).add_to(m)

    # Attach the legend last, captioned with the requested title.
    color_scale.caption = title
    color_scale.add_to(m)

    return m

In [None]:
def _largest_overlap_per_skysat(pairs):
    """Keep one row per skysat_id: the first after sorting by skysat_id and overlap_area, both descending."""
    ranked = pairs.sort_values(by=["skysat_id", "overlap_area"], ascending=False)
    return ranked.drop_duplicates(subset=["skysat_id"])


limit_df = _largest_overlap_per_skysat(inter_df)

limit_df_10min = _largest_overlap_per_skysat(inter_df_10min)

# Row counts after vs. before the reduction, for both tables.
for reduced, full in ((limit_df, inter_df), (limit_df_10min, inter_df_10min)):
    print(len(reduced), len(full))

In [None]:
# Preview the first five rows of the one-row-per-skysat_id table.
limit_df.head(5)

In [None]:
# Map the absolute acquisition time offset (hours) of each retained pair,
# without the grid overlay.
plot_df(limit_df, 'acquired_delta_hours', 'acquired_delta_hours', show_grids=False)

In [None]:
# Same map, restricted to pairs acquired less than 10 minutes apart.
plot_df(limit_df_10min, 'acquired_delta_hours', 'acquired_delta_hours', show_grids=False)

In [None]:
# Dissolve all retained footprints into a single geometry in ``orig_crs`` and
# simplify it with a tolerance of 1000 (in orig_crs units -- presumably
# metres, TODO confirm). The result is wrapped in a one-row frame and
# reprojected to ca_ocean's CRS.
footprints_proj = limit_df.to_crs(orig_crs)
dissolved = footprints_proj.union_all()
coverage = dissolved.simplify(1000, preserve_topology=True)

coverage_native = gpd.GeoDataFrame(geometry=[coverage], crs=orig_crs)
coverage_df = coverage_native.to_crs(ca_ocean.crs)

coverage_df

In [None]:
# Show the dissolved coverage geometry on a map centred on its centroid.
coverage_geom = coverage_df.geometry.iloc[0]
coverage_center = coverage_geom.centroid

m = folium.Map(
    location=[coverage_center.y, coverage_center.x],
    zoom_start=5,
    tiles="CartoDB positron",
    width=1000,
    height=600,
)
folium.GeoJson(data=coverage_geom).add_to(m)

m

In [None]:
# Count the retained pairs in each grid cell (non-null acquired_delta_sec
# values) and join the counts onto the cell geometries. The inner join drops
# grid cells that have no pairs.
grid_shapes = grids_ca.rename(columns={"cell_id": "grid_id"})[["grid_id", "geometry"]]
pairs_per_grid = limit_df.groupby('grid_id').acquired_delta_sec.count()

count_df = grid_shapes.merge(pairs_per_grid, on=["grid_id"], how="inner")
count_df = count_df.rename(columns={'acquired_delta_sec': 'counter'})

# count_df.counter = count_df.counter.clip(0, 5)

count_df

In [None]:
# Map the per-grid pair counts, without the grid overlay.
plot_df(count_df, 'counter', 'per_grid_counts', show_grids=False)

In [None]:
# Histogram of dove_tide_height over the one-row-per-skysat_id table.
limit_df.dove_tide_height.hist()