# In [None]:
# Cleaning Performance Metrics (Spatial Validity, Error Codes, Fuzzed Points, Timestamps)

import os
import sqlite3
import pandas as pd

# ------------------------------------
# 1. Load database
# ------------------------------------
# Build the path with os.path.join so the separator matches the host OS.
# On Windows this yields the same "Output\database\unified_database.db"
# string as before; on other platforms a literal backslash path would be
# read as one oddly named file in the working directory.
DB_PATH = os.path.join("Output", "database", "unified_database.db")

# Fail fast with a clear message: by default sqlite3.connect() creates a
# new, empty database when the file is missing, which would only surface
# later as "no such table" errors.
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"Database not found at: {DB_PATH}")

# Module-level connection shared by every query below.
conn = sqlite3.connect(DB_PATH)

# Helper
def count(query):
    """Run *query* on the shared connection and return its first cell.

    The SQL text is executed through ``pd.read_sql`` against the
    module-level ``conn``. The value at row 0, column 0 of the resulting
    frame is returned, so this is meant for single-value queries such as
    ``SELECT COUNT(*) ...``.
    """
    result_frame = pd.read_sql(query, conn)
    first_cell = result_frame.iloc[0, 0]
    return first_cell

# ------------------------------------
# 2. Spatial Validity (trajs)
# ------------------------------------
# Numerator: trajs rows whose SegmentId also appears in SegmentId_to_link.
valid_trajs = count("""
SELECT COUNT(*) AS n 
FROM trajs 
WHERE SegmentId IN (SELECT SegmentId FROM SegmentId_to_link)
""")

# Denominator: every row in trajs.
total_trajs = count("SELECT COUNT(*) FROM trajs")

# Share of trajs rows with a known segment; 0 when the table is empty so
# the division is never attempted with a zero denominator.
if total_trajs:
    spatial_validity_pct = valid_trajs / total_trajs * 100
else:
    spatial_validity_pct = 0

# ------------------------------------
# 3. Error-coded row removal (trajs)
# ------------------------------------
# Rows that carry an error code: ErrorCodes is neither NULL nor ''.
error_rows_sql = """
SELECT COUNT(*) 
FROM trajs 
WHERE ErrorCodes IS NOT NULL AND ErrorCodes <> ''
"""
error_rows = count(error_rows_sql)

# Everything else in trajs counts as clean.
clean_rows = total_trajs - error_rows

# Percentage of clean rows; 0 when trajs is empty.
if total_trajs:
    error_clean_pct = clean_rows / total_trajs * 100
else:
    error_clean_pct = 0

# ------------------------------------
# 4. Fuzzed-point filtering (waypoint)
# ------------------------------------
# Rows in waypoint whose fuzzed_point compares equal to the text '1'.
# NOTE(review): presumably fuzzed_point is a 0/1 flag column; confirm its
# declared type so the comparison against '1' matches as intended.
fuzzed_wp_sql = """
SELECT COUNT(*) 
FROM waypoint 
WHERE fuzzed_point = '1'
"""
total_wp = count("SELECT COUNT(*) FROM waypoint")
fuzzed_wp = count(fuzzed_wp_sql)

# Percentage of waypoint rows matched above; 0 when waypoint is empty.
if total_wp:
    fuzzed_pct = fuzzed_wp / total_wp * 100
else:
    fuzzed_pct = 0

# ------------------------------------
# 5. Timestamp consistency check
# Load a five-row sample of each table so that its column names can be
# scanned for time-related columns. Only column names are inspected from
# these samples; no NULL counting is performed here.
# ------------------------------------
_SAMPLE_SQL = "SELECT * FROM {} LIMIT 5"

df_trajs = pd.read_sql(_SAMPLE_SQL.format("trajs"), conn)
df_wp = pd.read_sql(_SAMPLE_SQL.format("waypoint"), conn)
df_read = pd.read_sql(_SAMPLE_SQL.format("Readings"), conn)
df_lane = pd.read_sql(_SAMPLE_SQL.format("lane_readings"), conn)

def inspect_timestamps(df):
    """Return the column names of *df* that look time-related.

    A column is kept when its lower-cased name contains the substring
    "time" or "date". Names are returned with their original casing, in
    the order they appear in ``df.columns``. This is a plain substring
    match, so a name such as "UpdatedAt" is also selected.
    """
    markers = ("time", "date")
    matches = []
    for col in df.columns:
        lowered = col.lower()
        if any(marker in lowered for marker in markers):
            matches.append(col)
    return matches

# Detect time-related column names in each sampled table, in the same
# order as the samples were loaded.
ts_trajs, ts_wp, ts_read, ts_lane = map(
    inspect_timestamps, (df_trajs, df_wp, df_read, df_lane)
)

# Summary
print("\n=== Cleaning Performance Metrics ===")
print(f"Spatial validity: {spatial_validity_pct:.2f}% ({valid_trajs}/{total_trajs})")
print(f"Rows without ErrorCodes: {clean_rows}/{total_trajs} ({error_clean_pct:.2f}%)")
# NOTE(review): the label says "removed", but fuzzed_wp counts rows that are
# present in waypoint with fuzzed_point = '1' -- confirm the intended wording.
print(f"Fuzzed points removed: {fuzzed_wp}/{total_wp} ({fuzzed_pct:.2f}%)")
print("\nTimestamp columns detected:")
for table_label, detected_cols in (
    ("trajs", ts_trajs),
    ("waypoint", ts_wp),
    ("Readings", ts_read),
    ("lane_readings", ts_lane),
):
    print(f"  {table_label}:", detected_cols)

# Release the shared connection. There is no try/finally around the script,
# so this line is reached only when every statement above succeeded.
conn.close()