In [None]:
# Analyze the database before and after the Basic Data Cleaner step: row counts, CSV exports and a Markdown report

import os
import sqlite3
import pandas as pd
from datetime import datetime

# ================== CONFIGURE HERE ==================
# Adjust this path to where your unified_database.db is located.
# The path is assembled with os.path.join so the separator matches the host
# OS; a hard-coded backslash path only resolves on Windows.
DB_PATH = os.path.join("Output", "database", "unified_database.db")
# ====================================================

# Fail early: sqlite3.connect() would otherwise create a new, empty database
# file at a mistyped path instead of reporting the mistake.
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"Database not found at: {DB_PATH}")

# Database connection
conn = sqlite3.connect(DB_PATH)

# ---- 1. Inspect available tables ----
tables_df = pd.read_sql(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
    conn
)
# Plain list of table names, used for membership checks further down.
tables = tables_df["name"].tolist()
print("Tables found in database:")
print(tables_df)

# ---- 2. Count rows for key tables ----
def get_rowcount(connection, table_name):
    """Return the number of rows in ``table_name``, or ``None`` on failure.

    Args:
        connection: Open database connection accepted by ``pd.read_sql``
            (a ``sqlite3`` connection in this file).
        table_name: Name of the table to count. It is quoted as an SQL
            identifier, so names containing spaces or keywords also work.

    Returns:
        The row count as a plain ``int``, or ``None`` when the table cannot
        be queried (for example because it does not exist).
    """
    # Identifiers cannot be bound as query parameters, so quote the name and
    # double any embedded double quote.
    quoted = '"' + str(table_name).replace('"', '""') + '"'
    try:
        df = pd.read_sql(f"SELECT COUNT(*) AS n FROM {quoted}", connection)
    except Exception as exc:
        # Deliberately best-effort: keep returning None, but say why so a
        # failed query is not mistaken for a missing table.
        print(f"Could not count rows in table {table_name}: {exc}")
        return None
    return int(df["n"].iloc[0])

# Row count per key table; a table absent from the database maps to None.
row_counts = {
    table: get_rowcount(conn, table) if table in tables else None
    for table in ("trajs", "waypoint", "SegmentId_to_link")
}

print("\nRow counts:")
for name, count in row_counts.items():
    shown = 'DOES NOT EXIST' if count is None else count
    print(f"  {name}: {shown}")

# ---- 3. Export cleaned tables to CSV ----
# Names of the CSV files actually written, listed later in the report.
exported_files = []

for table in ("trajs", "waypoint"):
    # Guard clause: nothing to export when the table is absent.
    if table not in tables:
        print(f"\nTable '{table}' does not exist, CSV was not exported.")
        continue

    table_df = pd.read_sql(f"SELECT * FROM {table}", conn)
    csv_name = f"{table}_cleaned.csv"
    table_df.to_csv(csv_name, index=False)
    exported_files.append(csv_name)
    print(f"\nTable '{table}' exported to: {csv_name}")

# ---- 4. Basic statistics ----
# Per-table statistics, keyed by table name; consumed by the report section.
stats = {}

# 4a) For trajs: ErrorCodes distribution (if column exists)
if "trajs" in tables:
    df_trajs = pd.read_sql("SELECT * FROM trajs", conn)
    cols_trajs = df_trajs.columns.tolist()

    if "ErrorCodes" not in cols_trajs:
        stats["trajs"] = {"note": "Column 'ErrorCodes' does not exist in trajs."}
    else:
        # NULL codes are folded into "" so they are counted as "no error".
        error_counts = df_trajs["ErrorCodes"].fillna("").value_counts()
        n_total = len(df_trajs)
        n_clean = error_counts.get("", 0)
        # Guard against an empty table to avoid dividing by zero.
        pct_clean = 100 * n_clean / n_total if n_total > 0 else 0

        stats["trajs"] = {
            "total_rows": n_total,
            "rows_without_errorcodes": n_clean,
            "pct_without_errorcodes": pct_clean,
            "errorcodes_distribution": error_counts.to_dict(),
        }

# 4b) For waypoint: fuzzed_point distribution (if exists)
try:
    if "waypoint" in tables:
        df_wp = pd.read_sql("SELECT * FROM waypoint", conn)
        cols_wp = df_wp.columns.tolist()
        stats["waypoint"] = {}

        if "fuzzed_point" in cols_wp:
            # NULL flags are folded into "" so they show up in the distribution.
            fuzz_counts = df_wp["fuzzed_point"].fillna("").value_counts()
            n_total = len(df_wp)
            # SQLite columns are dynamically typed, so the flag may come back
            # as the text "1", the integer 1 or the float 1.0. Compare
            # numerically so every representation is counted; looking up only
            # the string "1" reports 0 fuzzed rows for a non-text column.
            is_fuzzed = pd.to_numeric(df_wp["fuzzed_point"], errors="coerce") == 1
            n_fuzzed = int(is_fuzzed.sum())
            # Guard against an empty table to avoid dividing by zero.
            pct_fuzzed = 100 * n_fuzzed / n_total if n_total > 0 else 0

            stats["waypoint"]["total_rows"] = n_total
            stats["waypoint"]["fuzzed_rows"] = n_fuzzed
            stats["waypoint"]["pct_fuzzed"] = pct_fuzzed
            stats["waypoint"]["fuzzed_distribution"] = fuzz_counts.to_dict()
        else:
            stats["waypoint"]["note"] = "Column 'fuzzed_point' does not exist in waypoint."
finally:
    # All database reads are done; release the handle even if the waypoint
    # statistics above raised.
    conn.close()

# ---- 5. Generate Markdown report ----
# The report is accumulated as a list of lines and joined when saved.
report_lines = [
    "# Data Cleaning Report",
    "",
    f"- Generated on: {datetime.now().isoformat(timespec='seconds')}",
    f"- Database: `{DB_PATH}`",
    "",
    "## Detected Tables",
]
report_lines.extend(f"- {table_name}" for table_name in tables)
report_lines.append("")

report_lines.append("## Row counts per table")
for t, n in row_counts.items():
    # None marks a table that was not counted.
    report_lines.append(
        f"- **{t}**: {n} rows" if n is not None
        else f"- **{t}**: does not exist in database"
    )
report_lines.append("")

# Section for trajs
if "trajs" in stats:
    trajs_stats = stats["trajs"]
    report_lines.append("## Results for table `trajs`")
    if "total_rows" not in trajs_stats:
        # Only a note is available (e.g. the ErrorCodes column is missing).
        report_lines.append(f"- Note: {trajs_stats.get('note', 'No statistics available.')}")
    else:
        report_lines += [
            f"- Total rows: **{trajs_stats['total_rows']}**",
            f"- Rows without `ErrorCodes`: **{trajs_stats['rows_without_errorcodes']}** "
            f"({trajs_stats['pct_without_errorcodes']:.2f}%)",
            "",
            "### `ErrorCodes` distribution",
        ]
        for code, count in trajs_stats["errorcodes_distribution"].items():
            # The empty key stands for rows that carry no error code.
            label = "(empty / no error)" if code == "" else code
            report_lines.append(f"- `{label}`: {count}")
    report_lines.append("")

# Section for waypoint
if "waypoint" in stats:
    wp_stats = stats["waypoint"]
    report_lines.append("## Results for table `waypoint`")
    if "total_rows" not in wp_stats:
        # Only a note is available (e.g. the fuzzed_point column is missing).
        report_lines.append(f"- Note: {wp_stats.get('note', 'No statistics available.')}")
    else:
        report_lines += [
            f"- Total rows: **{wp_stats['total_rows']}**",
            f"- Rows with `fuzzed_point = '1'`: **{wp_stats['fuzzed_rows']}** "
            f"({wp_stats['pct_fuzzed']:.2f}%)",
            "",
            "### `fuzzed_point` distribution",
        ]
        for value, count in wp_stats["fuzzed_distribution"].items():
            # The empty key stands for rows whose flag was NULL.
            label = "(empty / NULL)" if value == "" else value
            report_lines.append(f"- `{label}`: {count}")
    report_lines.append("")

# CSV export note
report_lines.append("## Exported CSV files")
if not exported_files:
    report_lines.append("- No CSV files exported because the corresponding tables were not found.")
else:
    report_lines.extend(f"- `{csv_file}`" for csv_file in exported_files)
report_lines.append("")

# Save report (written to the current working directory)
report_path = "data_cleaning_report.md"
with open(report_path, "w", encoding="utf-8") as report_file:
    report_file.write("\n".join(report_lines))

print(f"\nReport generated: {report_path}")
print("Open it in VS Code and use Ctrl+Shift+V to preview the Markdown document.")


In [None]:
# Time standardization: compare the raw CSVs (before) with the processed database tables (after) in a Markdown report

import os
import sqlite3
import pandas as pd
from datetime import datetime

# ============================================
# 1. Locate project root
# ============================================
# A .py script defines __file__; a Jupyter / VS Code notebook does not, so
# the current working directory is used there instead.
if "__file__" not in globals():
    PROJECT_ROOT = os.getcwd()
else:
    PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))

print("PROJECT_ROOT =", PROJECT_ROOT)

# Base paths
RAW_ROOT = os.path.join(PROJECT_ROOT, "data_cleaning_fusion_datasets")
DB_PATH = os.path.join(PROJECT_ROOT, "Output", "database", "unified_database.db")

if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"Processed database not found at: {DB_PATH}")

# ============================================
# 2. Map BEFORE (CSV) vs AFTER (DB)
# ============================================
# Each entry pairs one raw CSV (BEFORE) with the database table (AFTER) it is
# compared against: (entity name, raw sub-folder, CSV file name, table name).
entities = [
    {
        "name": entity_name,
        "before_path": os.path.join(RAW_ROOT, raw_folder, csv_file),
        "after_table": table_name,
    }
    for entity_name, raw_folder, csv_file, table_name in (
        ("Readings", "tmc_speed", "Readings.csv", "Readings"),
        ("trajs", "trip path", "trajs.csv", "trajs"),
        ("waypoint", "waypoint", "waypoint.csv", "waypoint"),
    )
]

# ============================================
# 3. Helper functions
# ============================================
def load_before_csv(path):
    """Read the raw (BEFORE) CSV at ``path`` into a DataFrame.

    Returns the DataFrame, or ``None`` when the file is missing or cannot be
    parsed; in both cases a message is printed instead of raising.
    """
    if os.path.exists(path):
        try:
            return pd.read_csv(path)
        except Exception as err:
            print(f"Error reading CSV {path}: {err}")
    else:
        print(f"⚠️ BEFORE CSV not found: {path}")
    return None

def load_after_table(conn, table):
    """Load every row of ``table`` from the processed database.

    Args:
        conn: Open database connection accepted by ``pd.read_sql``
            (a ``sqlite3`` connection in this file).
        table: Name of the table to read. It is quoted as an SQL identifier,
            so names containing spaces or keywords also work.

    Returns:
        A DataFrame with the table contents, or ``None`` if the query fails
        (the error is printed rather than raised).
    """
    # Identifiers cannot be bound as query parameters, so quote the name and
    # double any embedded double quote.
    quoted = '"' + str(table).replace('"', '""') + '"'
    try:
        return pd.read_sql(f"SELECT * FROM {quoted}", conn)
    except Exception as e:
        print(f"Error reading table {table} from database: {e}")
        return None

def detect_time_columns(columns):
    """Return the column labels that look time-related, in input order.

    A label matches when its lower-cased text contains ``"time"`` or
    ``"date"``. ``"timestamp"`` needs no separate check because it already
    contains ``"time"``. Non-string labels (e.g. integer column names) are
    compared through ``str()`` instead of raising ``AttributeError``.

    NOTE(review): this is a substring heuristic, so a label such as
    ``"updated_by"`` also matches through ``"date"``.

    Args:
        columns: Iterable of column labels.

    Returns:
        List of the matching labels, unchanged.
    """
    markers = ("time", "date")
    return [
        c for c in columns
        if any(marker in str(c).lower() for marker in markers)
    ]

# ============================================
# 4. Open processed database
# ============================================
conn = sqlite3.connect(DB_PATH)

# List available tables
tables_df = pd.read_sql(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
    conn
)
# A set gives cheap membership checks when matching entities to tables.
available_tables = set(tables_df["name"])
print("Tables in processed database:", available_tables)

# ============================================
# 5. Comparison and report generation
# ============================================
# The report is accumulated as a list of lines and joined when saved.
report_lines = [
    "# Time Standardization — Before & After\n",
    f"- Generation date: {datetime.now().isoformat(timespec='seconds')}",
    f"- Processed database (AFTER): `{DB_PATH}`\n",
    "In this report, *BEFORE* refers to raw CSVs in `data_cleaning_fusion_datasets`,",
    "and *AFTER* refers to the resulting tables inside `unified_database.db`.\n",
]

# Names of the CSV files written while comparing, printed after the report.
exported_files = []

# Compare each entity's raw CSV (BEFORE) with its database table (AFTER) and
# append one Markdown section per entity. The try/finally guarantees the
# connection is released even if a comparison or CSV export raises.
try:
    for ent in entities:
        name = ent["name"]
        before_path = ent["before_path"]
        after_table = ent["after_table"]

        report_lines.append(f"## Entity: **{name}**\n")

        # BEFORE (CSV)
        df_before = load_before_csv(before_path)
        if df_before is None:
            report_lines.append(f"- ❌ Could not read BEFORE CSV at `{before_path}`.\n")
            report_lines.append("---\n")
            continue

        # AFTER (table in DB)
        if after_table not in available_tables:
            report_lines.append(f"- ❌ Table `{after_table}` does not exist in the processed database.\n")
            report_lines.append("---\n")
            continue

        df_after = load_after_table(conn, after_table)
        if df_after is None:
            report_lines.append(f"- ❌ Could not read table `{after_table}` from the database.\n")
            report_lines.append("---\n")
            continue

        # ---------------- Row counts ----------------
        n_before = len(df_before)
        n_after = len(df_after)
        report_lines.append("### Row counts")
        report_lines.append(f"- BEFORE (CSV): **{n_before}** rows")
        report_lines.append(f"- AFTER  (DB): **{n_after}** rows\n")

        # ---------------- Common columns ----------------
        common_cols = sorted(set(df_before.columns).intersection(df_after.columns))
        report_lines.append(f"### Common columns ({len(common_cols)})")
        if common_cols:
            report_lines.append(", ".join(common_cols) + "\n")
        else:
            # Nothing to compare column-wise; close this entity's section.
            report_lines.append("No common columns between BEFORE and AFTER.\n")
            report_lines.append("---\n")
            continue

        # ---------------- Time columns ----------------
        time_cols = detect_time_columns(common_cols)
        if time_cols:
            report_lines.append("### Time column statistics (ranges)")
            for col in time_cols:
                # min()/max() can fail on columns with mixed value types;
                # record the error for that column and keep going.
                try:
                    bmin, bmax = df_before[col].min(), df_before[col].max()
                    amin, amax = df_after[col].min(), df_after[col].max()
                    report_lines.append(f"- `{col}`:")
                    report_lines.append(f"   - BEFORE: min = {bmin}, max = {bmax}")
                    report_lines.append(f"   - AFTER : min = {amin}, max = {amax}")
                except Exception as e:
                    report_lines.append(f"- `{col}`: error computing ranges ({e})")
            report_lines.append("")
        else:
            report_lines.append("### Time columns")
            report_lines.append("No time-related columns detected.\n")

        # ---------------- Nulls ----------------
        report_lines.append("### Null values (common columns only)")
        found_nulls = False
        for col in common_cols:
            nb = df_before[col].isna().sum()
            na = df_after[col].isna().sum()
            if nb > 0 or na > 0:
                found_nulls = True
                report_lines.append(f"- `{col}`: BEFORE = {nb} nulls, AFTER = {na} nulls")
        if not found_nulls:
            # Say so explicitly; a bare header reads like a missing result.
            report_lines.append("- No null values in the common columns.")
        report_lines.append("")

        # ---------------- Duplicates ----------------
        report_lines.append("### Duplicate rows (exact matches)")
        dup_before = df_before.duplicated().sum()
        dup_after = df_after.duplicated().sum()
        report_lines.append(f"- BEFORE: {dup_before} duplicated rows")
        report_lines.append(f"- AFTER : {dup_after} duplicated rows\n")

        # ---------------- Export CSVs ----------------
        # Full copies of both frames (not samples), written to the current
        # working directory.
        before_out = f"{name}_before_for_report.csv"
        after_out = f"{name}_after_for_report.csv"

        df_before.to_csv(before_out, index=False)
        df_after.to_csv(after_out, index=False)

        exported_files.append(before_out)
        exported_files.append(after_out)

        report_lines.append("### Exported files")
        report_lines.append(f"- `{before_out}` (raw BEFORE CSV)")
        report_lines.append(f"- `{after_out}` (AFTER table from DB)\n")
        report_lines.append("---\n")
finally:
    conn.close()

# ============================================
# 6. Save Markdown report
# ============================================
REPORT_PATH = os.path.join(PROJECT_ROOT, "time_standardization_before_after.md")
with open(REPORT_PATH, "w", encoding="utf-8") as report_file:
    report_file.write("\n".join(report_lines))

print("\nReport generated:")
print(" ", REPORT_PATH)
print("\nExported CSV files:")
for csv_file in exported_files:
    print(" ", csv_file)

# Short how-to for previewing the Markdown output.
print("\nTo preview the report in VS Code:")
for step in (
    " 1) Open 'time_standardization_before_after.md'",
    " 2) Press: Ctrl + Shift + V (Open Preview)",
):
    print(step)
