# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from matplotlib.patches import Patch

# =====================================================
# 1. Define columns to evaluate in each source
#    (adjust names if they differ in your DB)
# =====================================================

# Columns of df_wp (plotted under the "Waypoint" source) to evaluate.
wp_cols = [
    "journey_id", "capture_time",
    "latitude", "longitude",
    "speed_mph", "local_time",
]

# Columns of df_tp (plotted under the "Trajs" source) to evaluate.
tp_cols = [
    "SegmentId",
    "CrossingStartDateLocal", "CrossingEndDateLocal",
    "CrossingSpeedMph",
]

# =====================================================
# 2. Completeness function (% of non-null values)
# =====================================================

def completeness(series):
    """Return the percentage of non-null entries in *series*.

    The result is the mean of the ``notna()`` mask scaled to 0-100, so a
    series without any nulls yields 100.0 and an all-null series yields 0.0.
    """
    present_mask = series.notna()
    fraction_present = present_mask.mean()
    return fraction_present * 100.0

records = []

# One entry per data source:
#   (label used in the plot, dataframe, dataframe name for warnings, columns)
# Waypoint: cleaned / map-matched in DB.  Trajs: cleaned + mapped in DB.
sources = [
    ("Waypoint", df_wp, "df_wp", wp_cols),
    ("Trajs", df_tp, "df_tp", tp_cols),
]

for src_label, frame, frame_name, cols in sources:
    for col in cols:
        if col not in frame.columns:
            # Report and skip a missing column instead of aborting the report.
            print(f"WARNING: column {col} does not exist in {frame_name}")
            continue
        comp = completeness(frame[col])
        records.append({"column": col, "source": src_label, "completeness": comp})

# Naming the columns explicitly keeps the expected schema even when every
# requested column was missing; otherwise the empty frame would have no
# "column" field and the lookup below would raise KeyError.
df_comp = pd.DataFrame(records, columns=["column", "source", "completeness"])

# Column order in the plot: Waypoint columns first, then Trajs columns.
# dict.fromkeys drops repeated names while preserving order, because
# pd.Categorical rejects duplicate categories.
order = list(dict.fromkeys(wp_cols + tp_cols))
df_comp["column"] = pd.Categorical(df_comp["column"], categories=order, ordered=True)
# A stable sort keeps the Waypoint row ahead of the Trajs row if both
# sources ever share a column name.
df_comp = df_comp.sort_values("column", kind="mergesort")

# =====================================================
# 3. Plot completeness bars
# =====================================================

plt.figure(figsize=(10, 6))

# One bar per row of df_comp, in the row order already set on df_comp.
x = np.arange(len(df_comp))
heights = df_comp["completeness"].to_numpy(dtype=float)

# Colors by source; the same mapping feeds the bars and the legend so the
# two cannot drift apart.
source_colors = {
    "Waypoint": "lightcoral",
    "Trajs": "goldenrod",
}
colors = df_comp["source"].map(source_colors).values

bars = plt.bar(x, heights, color=colors)

# Threshold line (e.g., 99%); the label is derived from the value so that
# changing `threshold` also updates the legend text.
threshold = 99.0
threshold_line = plt.axhline(
    threshold,
    color="red",
    linestyle="--",
    linewidth=1,
    label=f"{threshold:g}% Threshold",
)

# Default view is 95-101 %.  If any column is less complete than that, lower
# the bottom limit so its bar stays visible instead of falling entirely
# outside the axes (which would look like missing data).
y_min = 95.0
finite_heights = heights[np.isfinite(heights)]
if finite_heights.size and finite_heights.min() < y_min:
    y_min = max(0.0, float(np.floor(finite_heights.min())) - 1.0)
plt.ylim(y_min, 101)

plt.ylabel("Completeness (%)")
plt.xlabel("Column Name")
plt.title("Data Completeness by Source and Column")

plt.xticks(x, df_comp["column"], rotation=45, ha="right")

# Manual legend: one color patch per source plus the actual threshold line
# artist, so the threshold is drawn as a dashed line rather than a dashed box.
legend_handles = [
    Patch(facecolor=color, label=name) for name, color in source_colors.items()
]
legend_handles.append(threshold_line)
plt.legend(handles=legend_handles, loc="lower left")

plt.tight_layout()

# Save figure into a "figures_from_tool_db_only" folder under the current
# working directory, creating the folder on first use.
PROJECT_ROOT = os.getcwd()
out_dir = os.path.join(PROJECT_ROOT, "figures_from_tool_db_only")
out_path = os.path.join(out_dir, "data_completeness_waypoint_trajs.png")

os.makedirs(out_dir, exist_ok=True)  # no error if the folder already exists

# Write the PNG first, then display the figure.
plt.savefig(out_path, dpi=300)
plt.show()

print("Figure saved to:", out_path)
