# DuckDB + Parquet Data Exploration Template

In [5]:
import duckdb
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from branca.colormap import linear
from pathlib import Path
from shapely.geometry import box
import cartopy.io.shapereader as shpreader
from tqdm import tqdm
import json
from IPython.display import display

from src.util import create_config

# --- Configuration ---

In [7]:
# Root data directory; this is a machine-specific absolute path.
BASE = Path("/Users/kyledorman/data/planet_coverage/ca_only")  # <-- update this
# YAML configuration parsed by the project helper `create_config`.
config_file = BASE / "dove" / "config.yaml"  # <-- update this
config = create_config(config_file)

# Example path patterns
# Glob, relative to BASE, that matches every coastal_points.parquet file.
f_pattern = "*/coastal_results/*/*/*/coastal_points.parquet"
# The same glob as an absolute string, for interpolation into read_parquet().
all_files_pattern = str(BASE / f_pattern)

# Combined list used later when we search individual files
all_parquets = list(BASE.glob(f_pattern))

# Number of parquet files matched by the glob (shown as the cell output).
len(all_parquets)

59

In [9]:
# Read the ocean GeoJSON from the "shorelines" directory next to BASE and
# display its coordinate reference system.
ca_ocean = gpd.read_file(BASE.parent / "shorelines" / "ca_ocean.geojson")
ca_ocean.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [None]:
# Load the ocean outline and both grid layers, reproject the grids to the
# ocean CRS, and keep only the cells that intersect the ocean geometry.
# NOTE(review): this re-reads ca_ocean.geojson from BASE, whereas the earlier
# cell reads it from BASE.parent / "shorelines" -- confirm which path is intended.
ca_ocean = gpd.read_file(BASE / "ca_ocean.geojson")
query_df = gpd.read_file(BASE / "ocean_grids.gpkg").to_crs(ca_ocean.crs)
grids_df = (
    gpd.read_file(BASE / "coastal_grids.gpkg")
    .to_crs(ca_ocean.crs)
    # Rename so the column matches the `grid_id` column used in the SQL queries.
    .rename(columns={"cell_id": "grid_id"})
)

# Merge the ocean features once; the same union is reused for both filters
# instead of being recomputed per filter.
ca_ocean_union = ca_ocean.union_all()

query_ca = query_df[query_df.geometry.intersects(ca_ocean_union)]
grids_ca = grids_df[grids_df.geometry.intersects(ca_ocean_union)]

# Number of coastal grid cells and query cells kept (shown as the cell output).
len(grids_ca), len(query_ca)

In [None]:
def _fixed_style(color, weight):
    """Build a folium style_function that returns the same style for every feature."""
    return lambda feature: {"color": color, "weight": weight}


# Centre the map on the centroid of the ocean geometry labelled 0.
centroid = ca_ocean.geometry[0].centroid
base_map = folium.Map(
    location=[centroid.y, centroid.x],
    zoom_start=5,
    width=1000,
    height=800,
)

# Ocean geometries: thick red outlines, named by their position.
for idx, geo in enumerate(ca_ocean.geometry):
    ocean_layer = folium.GeoJson(
        geo,
        name=str(idx),
        style_function=_fixed_style("red", 4),
    )
    ocean_layer.add_to(base_map)

# Query cells: blue outlines, popup shows the cell_id.
for _, row in query_ca.iterrows():
    query_layer = folium.GeoJson(
        row.geometry,
        popup=str(row["cell_id"]),
        style_function=_fixed_style("blue", 2),
    )
    query_layer.add_to(base_map)

# Coastal grid cells: thin green outlines, popup shows the grid_id.
for _, row in grids_ca.iterrows():
    grid_layer = folium.GeoJson(
        row.geometry,
        popup=str(row["grid_id"]),
        style_function=_fixed_style("green", 1),
    )
    grid_layer.add_to(base_map)

# Display the map
base_map

In [None]:
# --- Connect to DuckDB ---
# connect() is called without a database path; every later cell queries through `con`.
con = duckdb.connect()

In [None]:
# Find the first parquet file that contains rows for one of the California
# grid cells; it backs the single-file `samples_one` view created below.
FILE = None

for search_index in grids_ca.grid_id:
    for file in all_parquets:
        # COUNT(*) without GROUP BY always yields exactly one row.
        (match_count,) = con.execute(
            f"SELECT COUNT(*) FROM read_parquet('{file}') WHERE grid_id = {search_index}"
        ).fetchone()

        if match_count > 0:
            print(f"Found grid_id {search_index} in: {file}")
            FILE = file
            break
    # Stop scanning further grid ids as soon as one file has matched.
    if FILE is not None:
        break

# Fail here with a clear message rather than later, when the view would be
# created over the literal path 'None'.
if FILE is None:
    raise LookupError(
        f"None of the {len(all_parquets)} parquet files contains a grid_id from grids_ca"
    )

In [None]:
# Register a view for all files
# read_parquet() receives the absolute glob string built in the configuration
# cell, so `samples_all` covers every file that glob matches.
con.execute(
    f"""
    CREATE OR REPLACE VIEW samples_all AS
    SELECT * FROM read_parquet('{all_files_pattern}');
"""
)

In [None]:
# Register a view for a single file for faster iteration
# FILE is the parquet path selected by the search cell above; it is
# interpolated directly into the SQL as a quoted path.
con.execute(f"""
    CREATE OR REPLACE VIEW samples_one AS
    SELECT * FROM '{FILE}'
""")

In [None]:
# --- Schema Inspection ---
# DESCRIBE the single-file view and print the result as a DataFrame.
print("Schema of samples_one:")
df = con.execute("DESCRIBE samples_one").fetchdf()
print(df)

In [None]:
# --- NULL Check ---
# Count NULLs per column of the single-file view. Each SUM(CASE ...) adds 1 for
# every row where that column IS NULL, so the result is one row of per-column
# NULL totals named null_<column>.
print("Checking for NULL values:")
df_nulls = con.execute("""
    SELECT
        SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) AS null_id,
        SUM(CASE WHEN acquired IS NULL THEN 1 ELSE 0 END) AS null_acquired,
        SUM(CASE WHEN item_type IS NULL THEN 1 ELSE 0 END) AS null_item_type,
        SUM(CASE WHEN satellite_id IS NULL THEN 1 ELSE 0 END) AS null_satellite_id,
        SUM(CASE WHEN instrument IS NULL THEN 1 ELSE 0 END) AS null_instrument,

        SUM(CASE WHEN cell_id IS NULL THEN 1 ELSE 0 END) AS null_cell_id,
        SUM(CASE WHEN grid_id IS NULL THEN 1 ELSE 0 END) AS null_grid_id,
        
        SUM(CASE WHEN has_8_channel IS NULL THEN 1 ELSE 0 END) AS null_has_8_channel,
        SUM(CASE WHEN has_sr_asset IS NULL THEN 1 ELSE 0 END) AS null_has_sr_asset,
        SUM(CASE WHEN clear_percent IS NULL THEN 1 ELSE 0 END) AS null_clear_percent,
        SUM(CASE WHEN quality_category IS NULL THEN 1 ELSE 0 END) AS null_quality_category,
        SUM(CASE WHEN ground_control IS NULL THEN 1 ELSE 0 END) AS null_ground_control,
        SUM(CASE WHEN publishing_stage IS NULL THEN 1 ELSE 0 END) AS null_publishing_stage,
        
        SUM(CASE WHEN satellite_azimuth IS NULL THEN 1 ELSE 0 END) AS null_satellite_azimuth,
        SUM(CASE WHEN sun_azimuth IS NULL THEN 1 ELSE 0 END) AS null_sun_azimuth,
        SUM(CASE WHEN sun_elevation IS NULL THEN 1 ELSE 0 END) AS null_sun_elevation,
        SUM(CASE WHEN view_angle IS NULL THEN 1 ELSE 0 END) AS null_view_angle,
        SUM(CASE WHEN coverage_pct IS NULL THEN 1 ELSE 0 END) AS null_coverage_pct,
    FROM samples_one
""").fetchdf()
# Show the one-row summary as the cell output.
df_nulls

In [None]:
# Show distinct values of each categorical / boolean column across all files.
# The column names are interpolated into SQL as identifiers, so they are kept
# free of stray whitespace.
cols = ['item_type', 'instrument', 'has_8_channel', 'has_sr_asset', 'quality_category', 'ground_control', 'publishing_stage']
for col in cols:
    df = con.execute(f"SELECT DISTINCT {col} from samples_all").fetchdf()
    # head() limits the display to the first five distinct values.
    display(df.head())

In [None]:
# Show distinct values of each categorical / boolean column in the single-file view.
# The column names are interpolated into SQL as identifiers, so they are kept
# free of stray whitespace.
cols = ['item_type', 'instrument', 'has_8_channel', 'has_sr_asset', 'quality_category', 'ground_control', 'publishing_stage']
for col in cols:
    df = con.execute(f"SELECT DISTINCT {col} from samples_one").fetchdf()
    # head() limits the display to the first five distinct values.
    display(df.head())

In [None]:
# --- Preview Data ---
# Fetch and display the first 10 rows of the single-file view.
df_preview = con.execute("SELECT * FROM samples_one LIMIT 10").fetchdf()
display(df_preview)

In [None]:
# --- Count Rows (single file) ---
# fetchone() returns a one-element tuple; [0] extracts the count.
print("Total rows in sample file:")
print(con.execute("SELECT COUNT(*) FROM samples_one").fetchone()[0])

# --- Count Rows (all files) ---
print("Total rows all files:")
print(con.execute("SELECT COUNT(*) FROM samples_all").fetchone()[0])

In [None]:
# --- Filter By Time ---
# Rows of the single-file view acquired in the half-open interval
# [2024-01-01, 2025-01-01), capped at 100 rows.
df_2024 = con.execute("""
    SELECT *
    FROM samples_one
    WHERE acquired >= '2024-01-01' AND acquired < '2025-01-01'
    LIMIT 100
""").fetchdf()
display(df_2024)

In [None]:
def plot_df(df, column_name, title, radius=6):
    """Draw one circle marker per row of `df`, coloured by `column_name`.

    Args:
        df: frame exposing `.geometry` and a `column_name` column; the row
            index label is shown as "Grid ID" in each popup.
        column_name: column whose values drive marker colour and popup text.
        title: caption for the colour legend.
        radius: marker radius handed to folium.CircleMarker.

    Returns:
        The folium.Map holding the markers and the colour legend.
    """
    values = df[column_name]
    upper = values.max()
    # When every value is equal, anchor the low end of the scale at 0 instead
    # of using the (identical) minimum.
    lower = 0 if upper == values.min() else values.min()
    color_scale = linear.viridis.scale(lower, upper)

    # Centre the map on the mean of the per-row centroids.
    centroids = df.geometry.centroid
    m = folium.Map(
        location=[centroids.y.mean(), centroids.x.mean()],
        zoom_start=5,
        tiles="CartoDB positron",
        width=1000,
        height=600,
    )

    for grid_label, row in df.iterrows():
        value = row[column_name]
        point = row.geometry.centroid
        marker = folium.CircleMarker(
            location=[point.y, point.x],
            radius=radius,
            fill=True,
            fill_opacity=0.7,
            color=None,
            fill_color=color_scale(value),
            popup=f"Grid ID: {grid_label}<br>{column_name}: {value:.2f}",
        )
        marker.add_to(m)

    # Attach the legend with the requested caption.
    color_scale.caption = title
    color_scale.add_to(m)

    return m


def plot_bool_pct(column_name, bool_logic_str, merge_df, title, radius=6, nafill=0.0):
    """Map, per grid cell, the fraction of PSScene rows satisfying a boolean SQL expression.

    Args:
        column_name: suffix for the result column, which is named `frac_<column_name>`.
        bool_logic_str: SQL boolean expression interpolated into SUM(...).
        merge_df: frame with a `grid_id` column; every one of its rows is kept
            (left join) and plotted.
        title: legend caption passed to `plot_df`.
        radius: marker radius passed to `plot_df`.
        nafill: value used for grid cells that have no row in the query result.

    Returns:
        The folium map produced by `plot_df`.
    """
    frac_col = f"frac_{column_name}"

    df_pct = con.execute(
        f"""
        SELECT grid_id,
               SUM({bool_logic_str})::DOUBLE  / COUNT(*) AS frac_{column_name}
        FROM samples_all
        WHERE item_type = 'PSScene'
        GROUP BY grid_id
    """
    ).fetchdf()

    # Left join keeps every cell of merge_df; unmatched cells get `nafill`.
    fractions = df_pct.set_index("grid_id")
    joined = merge_df.set_index("grid_id").join(fractions, how="left")
    geo_pct = joined.fillna({frac_col: nafill})

    return plot_df(geo_pct, frac_col, title, radius)

In [None]:
# --- Load Geo Points and Join ---

# Sample count per grid cell (PSScene rows only)
df_counts = con.execute("""
    SELECT grid_id, COUNT(*) as sample_count
    FROM samples_all
    WHERE item_type = 'PSScene'
    GROUP BY grid_id
""").fetchdf()

# Left join keeps every California grid cell; cells with no samples get a count of 0.
geo_plot = grids_ca.set_index("grid_id").join(df_counts.set_index("grid_id"), how="left").fillna({"sample_count": 0})

plot_df(geo_plot, "sample_count", "Sample Count PSScene", 4)

In [None]:
# --- Load Geo Points and Join (high-quality subset) ---

# Count per grid cell of PSScene rows that pass every quality filter below:
# coverage_pct above 0.5, finalized stage, standard quality, and both the
# has_sr_asset and ground_control flags set.
df_counts = con.execute("""
    SELECT grid_id, COUNT(*) as sample_count
    FROM samples_all
    WHERE item_type = 'PSScene'
    AND coverage_pct > 0.5
    AND publishing_stage = 'finalized'
    AND quality_category = 'standard'
    AND has_sr_asset
    AND ground_control
    GROUP BY grid_id
""").fetchdf()

# Left join keeps every California grid cell; cells with no qualifying rows get 0.
geo_plot = grids_ca.set_index("grid_id").join(df_counts.set_index("grid_id"), how="left").fillna({"sample_count": 0})

plot_df(geo_plot, "sample_count", "High Quality Sample Count", radius=6)

In [None]:
# ---- Histogram Per Day Counts (w/Publish Stage) ---- #

# 1. Pull per-day counts of distinct scene ids, broken out by stage
df_stage = con.execute("""
    SELECT
        date_trunc('day', acquired) AS day,
        publishing_stage,
        COUNT(DISTINCT id) AS cnt
    FROM samples_all
    WHERE item_type = 'PSScene'
    GROUP BY day, publishing_stage
    ORDER BY day
""").fetchdf()

# 2. Normalize day column and pivot so each stage is its own column
df_stage['day'] = pd.to_datetime(df_stage['day']).dt.date
df_pivot = (
    df_stage
    .pivot(index='day', columns='publishing_stage', values='cnt')
    .fillna(0)
)

# 3. Ensure a consistent stack order (stages absent from the data become
#    all-zero columns; any stage not listed here is dropped by reindex)
stages = ['preview', 'finalized', 'standard']
df_pivot = df_pivot.reindex(columns=stages, fill_value=0)

# 4. Plot stacked bars
fig, ax = plt.subplots(figsize=(10, 4))
bottom = np.zeros(len(df_pivot))
# The x labels are identical for every stage, so convert them once.
day_labels = df_pivot.index.astype(str)

for stage in stages:
    ax.bar(
        day_labels,
        df_pivot[stage],
        bottom=bottom,
        label=stage
    )
    # Each new stage is stacked on top of the running total.
    bottom += df_pivot[stage].values

# Rotate the tick labels bar() already created. Calling set_xticklabels()
# without first fixing the tick positions is discouraged by matplotlib because
# labels and ticks can fall out of sync.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title("Sample Count per Day by Publishing Stage")
ax.set_xlabel("Day")
ax.set_ylabel("Sample Count")
ax.legend(title="Publishing Stage")
fig.tight_layout()
plt.show()

In [None]:
def compute_minmax(column: str) -> pd.DataFrame:
    """Return a one-row DataFrame with `minv` and `maxv` of `column`.

    The aggregate runs over the PSScene rows of the `samples_all` view.
    `column` is interpolated into the SQL as an identifier.
    """
    return con.execute(
        f"""
        SELECT
        MIN({column}) AS minv,
        MAX({column}) AS maxv
        FROM samples_all
        WHERE item_type = 'PSScene'
    """
    ).fetchdf()

def compute_histogram(column: str, nbins: int = 30) -> pd.DataFrame:
    """
    Bin `column` of samples_all (PSScene rows only) with DuckDB's histogram()
    aggregate, using equi-width bins that span the column's min..max.

    Args:
        column: column name, interpolated into the SQL as an identifier.
        nbins: number of bins requested from equi_width_bins().

    Returns:
        DataFrame sorted by bin with columns bin_upper, count, bin_lower,
        centers, widths. An empty DataFrame with the same columns is returned
        when the query yields no histogram (e.g. no PSScene rows).
    """
    out_columns = ["bin_upper", "count", "bin_lower", "centers", "widths"]

    sql = f"""
        WITH bounds AS (
          SELECT
            MIN({column}) AS mn,
            MAX({column}) AS mx
          FROM samples_all
          WHERE item_type = 'PSScene'
        )
        SELECT
          -- histogram() returns a MAP<upper_boundary, count>
          histogram(
            {column},
            equi_width_bins(bounds.mn::DOUBLE, bounds.mx::DOUBLE, {nbins}::BIGINT, True)
          ) AS hist_map,
          -- column minimum, used as the lower edge when only one bin exists
          MIN(bounds.mn)::DOUBLE AS mn
        FROM samples_all
        CROSS JOIN bounds
        WHERE item_type = 'PSScene';
    """
    result = con.execute(sql).fetchdf().iloc[0]
    hist_map = result["hist_map"]

    # A NULL histogram does not come back as a dict; there is nothing to bin.
    if not isinstance(hist_map, dict) or not hist_map:
        return pd.DataFrame(columns=out_columns)

    # Unpack into a two-column DataFrame
    df = pd.DataFrame({
        'bin_upper': list(hist_map.keys()),
        'count':     list(hist_map.values())
    })
    df = df.sort_values('bin_upper').reset_index(drop=True)
    uppers = df['bin_upper'].tolist()

    if len(uppers) > 1:
        # Bins are equi-width, so the first gap gives the width of every bin.
        bin_size = uppers[1] - uppers[0]
    else:
        # Single bin: there is no second edge to difference against, so span
        # from the column minimum to the only upper edge.
        bin_size = uppers[0] - float(result["mn"])
        if not bin_size > 0:
            # Constant column (or unusable minimum): use a unit-wide
            # placeholder so the bar is still drawable.
            bin_size = 1.0

    # Compute lower edge from previous upper
    lowest = uppers[0] - bin_size
    df["bin_lower"] = [lowest] + uppers[:-1]
    df["centers"] = (df["bin_lower"] + df['bin_upper']) / 2
    df["widths"] = df['bin_upper'] - df["bin_lower"]
    return df

In [None]:
# 2x2 grid with one histogram per angle column.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

angle_columns = ["satellite_azimuth", "sun_azimuth", "sun_elevation", "view_angle"]

for ax, col in zip(axes, angle_columns):
    df_hist = compute_histogram(col, nbins=30)
    # Human-readable label, e.g. "sun_azimuth" -> "Sun Azimuth".
    pretty_name = col.replace('_', ' ').title()

    # Bars are drawn at 90% of the bin width to leave a small gap between bins.
    ax.bar(df_hist["centers"], df_hist['count'], width=df_hist["widths"] * 0.9)
    ax.set_title(f"Histogram of {pretty_name}")
    ax.set_xlabel(pretty_name)
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

In [None]:
# --- Bar Chart of Sample Count per Month ---
# Distinct scene ids per calendar month, PSScene rows only.
df_monthly = con.execute("""
    SELECT date_trunc('month', acquired) AS month, COUNT(DISTINCT id) AS sample_count
    FROM samples_all
    WHERE item_type = 'PSScene'
    GROUP BY month
    ORDER BY month
""").fetchdf()

# Plot one bar per month; months are converted to strings for the x axis
plt.figure(figsize=(10,4))
plt.bar(df_monthly['month'].astype(str), df_monthly['sample_count'])
plt.xticks(rotation=45, ha='right')
plt.title("Sample Count per Month (Unique Scenes)")
plt.xlabel("Month")
plt.ylabel("Sample Count")
plt.tight_layout()
plt.show()

In [None]:
# --- Histogram of coverage_pct ---
fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# 10 equi-width bins over the PSScene rows of samples_all.
df_hist = compute_histogram("coverage_pct", nbins=10)

# Bars are drawn at 90% of the bin width to leave a small gap between bins.
ax.bar(df_hist["centers"], df_hist['count'], width=df_hist["widths"] * 0.9)
ax.set_title(f"Histogram of {'coverage_pct'.title()}")
ax.set_xlabel("coverage_pct".title())
ax.set_ylabel("Frequency")
    
fig.tight_layout()
plt.show()

In [None]:
# --- Histogram of clear_percent ---
fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# 30 equi-width bins over the PSScene rows of samples_all.
df_hist = compute_histogram("clear_percent", nbins=30)

# Bars are drawn at 90% of the bin width to leave a small gap between bins.
ax.bar(df_hist["centers"], df_hist['count'], width=df_hist["widths"] * 0.9)
ax.set_title(f"Histogram of {'clear_percent'.title()}")
ax.set_xlabel("clear_percent".title())
ax.set_ylabel("Frequency")
    
fig.tight_layout()
plt.show()

In [None]:
# --- Fraction of finalized per Grid Point ---
# The first argument only names the result column (frac_publishing_stage);
# the second is the SQL condition that is summed per grid cell.
plot_bool_pct(
    'publishing_stage', 
    "publishing_stage = 'finalized'", 
    grids_ca, 
    "Fraction of Finalized Observations",
)

In [None]:
# --- Fraction of preview per Grid Point ---
# Grid cells with no PSScene rows are shown with a fraction of 0.0.
plot_bool_pct(
    'publishing_stage', 
    "publishing_stage = 'preview'", 
    grids_ca,
    "Fraction of Preview Observations",
    nafill=0.0
)


In [None]:
# --- Fraction of has_8_channel per Grid Point ---
# The has_8_channel column itself is the condition summed per grid cell.
plot_bool_pct(
    'has_8_channel', 
    "has_8_channel", 
    grids_ca, 
    "Fraction of 8-Channel Observations",
    nafill=0.0,
)

In [None]:
# --- Fraction of has_sr_asset ("analysis ready") per Grid Point ---
# The has_sr_asset column itself is the condition summed per grid cell.
plot_bool_pct(
    'has_sr_asset', 
    "has_sr_asset", 
    grids_ca, 
    "Fraction of Analysis Ready Observations",
    nafill=0.0,
)

In [None]:
# --- Fraction of ground_control per Grid Point ---
# The ground_control column itself is the condition summed per grid cell.
plot_bool_pct(
    'ground_control', 
    "ground_control", 
    grids_ca, 
    "Fraction of ground_control Observations",
    nafill=0.0,
)

In [None]:
# Per grid cell: fraction of preview-stage PSScene rows that have ground_control set.
df_pct = con.execute(
    """
    SELECT grid_id,
           SUM(ground_control)::DOUBLE  / COUNT(*) AS frac_preview_gc
    FROM samples_all
    WHERE item_type = 'PSScene' AND publishing_stage = 'preview'
    GROUP BY grid_id
"""
).fetchdf()

# Show a few cells where more than half of the preview rows have ground control.
display(df_pct[df_pct.frac_preview_gc > 0.5].head())

# Left join keeps every California grid cell. Cells with no preview rows are
# filled with 0.5 rather than 0.
# NOTE(review): presumably a neutral mid-scale value -- confirm this is intended.
geo_pct = grids_ca.set_index("grid_id").join(df_pct.set_index("grid_id"), how="left").fillna({"frac_preview_gc": 0.5})

plot_df(
    geo_pct, 
    "frac_preview_gc", 
    "Fraction Preview Scenes w/Ground Control", 
)