# DuckDB + Parquet Data Exploration Template

In [None]:
import duckdb
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import folium
from branca.colormap import linear
from pathlib import Path
from tqdm import tqdm
import json
from IPython.display import display
import shapely

# --- Configuration ---

In [None]:
# Root directory of the dataset on the local machine.
BASE = Path("/Users/kyledorman/data/planet_coverage/ca_only/")  # <-- update this

# Glob (relative to BASE) matching the per-grid coastal points files.
f_pattern = "*/coastal_results/*/*/*/coastal_points.parquet"
# The same glob as one absolute string.
all_files_pattern = str(BASE.joinpath(f_pattern))

# Concrete list of matching files, for searching individual files later.
all_parquets = [*BASE.glob(f_pattern)]

# A grid index is rendered as 6 zero-padded hex digits and split into three
# 2-character directory levels, e.g. 1 -> '000001' -> 00/00/01.
IDX = 1
hex_id = format(IDX, "06x")
d1, d2, d3 = hex_id[0:2], hex_id[2:4], hex_id[4:6]
GRID_PATH = BASE.joinpath("dove", "coastal_results", d1, d2, d3)
FILE = GRID_PATH.joinpath("coastal_points.parquet")

# Stop here if the sample file is missing (e.g. BASE does not point at the data).
assert FILE.exists()

len(all_parquets)

In [None]:
# Load the ocean outline and the two grid layers; the grids are reprojected
# into the outline's CRS so the intersection tests below compare like with like.
ca_ocean = gpd.read_file(BASE / "ca_ocean.geojson")
target_crs = ca_ocean.crs
query_df = gpd.read_file(BASE / "ocean_grids.gpkg").to_crs(target_crs)
grids_df = gpd.read_file(BASE / "coastal_grids.gpkg").to_crs(target_crs)
grids_df = grids_df.rename(columns={"cell_id": "grid_id"})

# Keep the ocean grids that touch the outline, then the coastal grids that
# touch any of those ocean grids.
ocean_outline = ca_ocean.union_all()
query_ca = query_df[query_df.geometry.intersects(ocean_outline)]
query_footprint = query_ca.union_all()
grids_ca = grids_df[grids_df.geometry.intersects(query_footprint)]

len(grids_ca), len(query_ca)

In [None]:
# Centre the overview map on the mean of the grid-cell centroids.
grid_centroids = grids_df.geometry.centroid
x = grid_centroids.x.mean()
y = grid_centroids.y.mean()
base_map = folium.Map(location=[y, x], zoom_start=4, width=1000, height=800)

# One small marker per grid cell at its centroid; the popup shows the cell id.
# The id column is "grid_id": `cell_id` was renamed when grids_df was loaded,
# so looking up "cell_id" here would raise a KeyError.
for _, row in grids_df.iterrows():
    pt = row["geometry"].centroid
    folium.CircleMarker(
        location=[pt.y, pt.x],
        radius=0.5,
        color="red",
        fill=True,
        fill_opacity=0.3,
        popup=str(row["grid_id"]),
    ).add_to(base_map)

# Display the map
base_map

In [None]:
# --- Connect to DuckDB ---
# Explicitly request the (default) in-memory database.
con = duckdb.connect(database=":memory:")

In [None]:
# Expose every file matched by all_files_pattern as the view `samples_all`.
samples_all_sql = f"""
    CREATE OR REPLACE VIEW samples_all AS
    SELECT * FROM read_parquet('{all_files_pattern}');
"""
con.execute(samples_all_sql)

In [None]:
# Expose the single sample FILE as the view `samples_one` (faster iteration).
samples_one_sql = f"""
    CREATE OR REPLACE VIEW samples_one AS
    SELECT * FROM '{FILE}'
"""
con.execute(samples_one_sql)

In [None]:
def plot_df(df, column_name, title, radius=6):
    """Render one coloured circle per row of ``df`` on a folium map.

    Args:
        df: Geo table; the code reads ``df.geometry``, ``df[column_name]``
            and a ``cell_id`` field on every row.
        column_name: Column whose value drives the marker colour and is
            shown with two decimals in the popup.
        title: Caption of the colour legend.
        radius: Marker radius passed to ``folium.CircleMarker``.

    Returns:
        The ``folium.Map`` with all markers and the colour legend attached.
    """
    values = df[column_name]
    lowest, highest = values.min(), values.max()
    # When every value is identical the lower bound is pinned to 0
    # instead of that value.
    color_scale = linear.viridis.scale(0 if highest == lowest else lowest, highest)

    # Centre the map on the mean centroid of all geometries.
    centroids = df.geometry.centroid
    m = folium.Map(
        location=[centroids.y.mean(), centroids.x.mean()],
        zoom_start=5,
        tiles="CartoDB positron",
        width=1000,
        height=600,
    )

    for _, record in df.iterrows():
        value = record[column_name]
        center = record.geometry.centroid
        marker = folium.CircleMarker(
            location=[center.y, center.x],
            radius=radius,
            fill=True,
            fill_opacity=0.7,
            color=None,  # no outline, fill only
            fill_color=color_scale(value),
            popup=f"Grid ID: {record.cell_id}<br>{column_name}: {value:.2f}",
        )
        marker.add_to(m)

    # Attach the legend last, captioned with the caller's title.
    color_scale.caption = title
    color_scale.add_to(m)

    return m

In [None]:
# --- SkySat: sample count per grid point ---

geo_plot = gpd.read_file("../extracted/skysat_sample_count.gpkg")
# Only plot points whose sample_count is above 1.
skysat_sampled = geo_plot[geo_plot["sample_count"] > 1.0]
plot_df(skysat_sampled, "sample_count", "Sample Count SkySat", radius=3)


In [None]:
# --- Dove (PSScene): sample count per grid point ---

geo_plot_sample_count_dove = gpd.read_file("../extracted/dove_sample_count.gpkg")

# Only plot points whose sample_count is above 1.
dove_sampled = geo_plot_sample_count_dove[geo_plot_sample_count_dove["sample_count"] > 1.0]
plot_df(dove_sampled, "sample_count", "Sample Count PSScene", radius=3)

In [None]:
# Draw the pre-binned histograms of the four angle columns on a 2×2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

angle_columns = ("satellite_azimuth", "sun_azimuth", "sun_elevation", "view_angle")
for ax, col in zip(axes, angle_columns):
    # Each CSV supplies the bin centers, widths and counts that are read below.
    df_hist = pd.read_csv(f"../extracted/hist_{col}.csv")
    label = col.replace('_', ' ').title()

    # Bars use 90% of the bin width so neighbouring bins stay visually separate.
    ax.bar(df_hist["centers"], df_hist["count"], width=df_hist["widths"] * 0.9)
    ax.set(title=f"Histogram of {label}", xlabel=label, ylabel="Frequency")

fig.tight_layout()
plt.show()

In [None]:
# Boolean mask: rows of the dove sample-count table with sample_count above 1.
yes_samples = geo_plot_sample_count_dove["sample_count"].gt(1.0)

In [None]:
# --- Publishing-stage fraction per grid point ---
# NOTE(review): the legend title says "Preview" -- confirm which publishing
# stage frac_publishing_stage actually measures.

geo_plot = gpd.read_file("../extracted/publishing_stage_pct.gpkg")
# yes_samples comes from the dove sample-count table; this presumably relies
# on both tables sharing the same index -- TODO confirm.
stage_subset = geo_plot[yes_samples]
plot_df(stage_subset, "frac_publishing_stage", "Fraction of Preview Observations", radius=3)

In [None]:
# --- Ground-control fraction per grid point ---

geo_plot = gpd.read_file("../extracted/pct_ground_control.gpkg")
# yes_samples comes from the dove sample-count table; this presumably relies
# on both tables sharing the same index -- TODO confirm.
ground_control_subset = geo_plot[yes_samples]
plot_df(ground_control_subset, "frac_ground_control", "Fraction of ground_control Observations", radius=3)

In [None]:
# Build the glob for one grid cell: any top-level folder, year 2023 only.
GRID_ID = 634
hex_id = format(GRID_ID, "06x")  # zero-padded 6-digit hex: 634 -> '00027a'
d1, d2, d3 = (hex_id[i:i + 2] for i in (0, 2, 4))
one_grid_pattern = f'/Users/kyledorman/data/planet_coverage/points_30km/*/results/2023/{d1}/{d2}/{d3}/ocean.parquet'

# Register a view over the parquet files of this single grid cell.
samples_grid_sql = f"""
    CREATE OR REPLACE VIEW samples_grid AS
    SELECT * FROM read_parquet('{one_grid_pattern}');
"""
con.execute(samples_grid_sql)

In [None]:
import polars as pl

GRID_ID = 634
hex_id = format(GRID_ID, "06x")  # zero-padded 6-digit hex: 634 -> '00027a'
d1, d2, d3 = (hex_id[i:i + 2] for i in (0, 2, 4))

# Print the row count of this grid's ocean.parquet for each source folder and
# year, or "None" where the file does not exist.
for s in ["dove", "skysat"]:
    for y in ["2024", "2023"]:
        GRID_PATH = BASE.joinpath(s, "results", y, d1, d2, d3)
        file = GRID_PATH / "ocean.parquet"
        if file.exists():
            df = pl.read_parquet(file)
            print(s, y, len(df))
        else:
            print(s, y, "None")
        

In [None]:
# --- Schema Inspection ---
print("Schema of samples_grid:")
df = con.execute("DESCRIBE samples_grid").fetchdf()
print(df)

# Preview the distinct values of selected columns, one query per column.
# (The 'has_8_channel' entry previously carried a stray trailing space.)
cols = ['cell_id', 'item_type', 'instrument', 'has_8_channel', 'has_sr_asset', 'quality_category', 'ground_control', 'publishing_stage']
for col in cols:
    df = con.execute(f"SELECT DISTINCT {col} from samples_grid").fetchdf()
    # head() limits the preview to the first five distinct values.
    display(df.head())

In [None]:
import polars as pl
from src.create_ocean_df import SCHEMA

# Time tolerance, in minutes, on either side of the SkySat acquisition.
n = 10

# Self-join of samples_grid: `a` is restricted to SkySatCollect rows, `b` to
# PSScene rows, paired when b was acquired within +/- n minutes of a.
query = f"""
SELECT
  a.cell_id,
  a.acquired,
  a.clear_percent,
  a.quality_category,
  a.publishing_stage,
  a.geometry_wkb,
  b.acquired         AS ps_acquired,
  b.clear_percent    AS ps_clear_percent,
  b.quality_category AS ps_quality_category,
  b.publishing_stage AS ps_publishing_stage,
  b.geometry_wkb     AS ps_geometry_wkb,
FROM samples_grid AS a
INNER JOIN samples_grid AS b
  ON a.item_type = 'SkySatCollect'
  AND b.item_type = 'PSScene'
  -- within n minutes before or after
  AND b.acquired BETWEEN
        a.acquired - INTERVAL '{n}' MINUTE
    AND a.acquired + INTERVAL '{n}' MINUTE
"""

result = con.execute(query).fetchdf()

# Decode the WKB blobs of both footprints into shapely geometries.
for wkb_column, geom_column in (("geometry_wkb", "geometry"), ("ps_geometry_wkb", "ps_geometry")):
    result[geom_column] = result[wkb_column].apply(lambda raw: shapely.wkb.loads(bytes(raw)))

print(len(result))

# One row per matched (SkySatCollect, PSScene) pair; because this is an INNER
# JOIN, SkySat rows without a PSScene match in the window are not included.
result.head()

In [None]:
# Two GeoDataFrames over the same rows, one per footprint column.
gdf = gpd.GeoDataFrame(result, geometry="geometry", crs="EPSG:4326")
ps_gdf = gpd.GeoDataFrame(result, geometry="ps_geometry", crs="EPSG:4326")

# Centre the map on the mean centroid of the `geometry` footprints.
footprint_centroids = gdf.geometry.centroid
x = footprint_centroids.x.mean()
y = footprint_centroids.y.mean()
base_map = folium.Map(location=[y, x], zoom_start=5, width=1000, height=800)


def _outline(colour):
    """Return a folium style function that draws a weight-2 line in ``colour``."""
    return lambda feature: {"color": colour, "weight": 2}


# Per row: `geometry` is drawn in green, then `ps_geometry` in red.
for _, row in result.iterrows():
    for column, colour in (("geometry", "green"), ("ps_geometry", "red")):
        folium.GeoJson(row[column], style_function=_outline(colour)).add_to(base_map)

# Display the map
base_map