In [16]:
import sys, os, json, time
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from deltalake import DeltaTable, write_deltalake

ROOT = Path(r"C:\engine_module_pipeline")
DELTA_SRC   = ROOT / r"delta\engine_module_delta"          # source Delta (with _delta_log)
INFER_READY = ROOT / r"delta\engine_module_infer_ready"    # TARGET (Delta)
CONFIG      = ROOT / "config"
ARTIFACTS   = ROOT / "engine_module_artifacts"

CHECKPOINT_DIR  = ROOT / r"data\checkpoints"
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_PATH = CHECKPOINT_DIR / "infer_ready_shared_checkpoint.json"
LOCK_PATH       = CHECKPOINT_DIR / "infer_ready_shared.lock"
LOG_STATE_PATH  = CHECKPOINT_DIR / "infer_ready_log_state.json"   # incremental _delta_log tracker
DLQ_DIR         = ROOT / r"data\dlq\infer_ready"
DLQ_DIR.mkdir(parents=True, exist_ok=True)

print("Python:", sys.version.split()[0])
print("deltalake:", __import__("deltalake").__version__)
print("pyarrow:", pa.__version__)
print("Paths:")
print("  SRC _delta_log:", (DELTA_SRC / "_delta_log").exists(), DELTA_SRC / "_delta_log")
print("  TARGET (Delta):", INFER_READY)
print("  CHECKPOINT:", CHECKPOINT_PATH)
print("  LOG_STATE:", LOG_STATE_PATH)
print("  DLQ:", DLQ_DIR)


Python: 3.11.9
deltalake: 1.1.4
pyarrow: 16.1.0
Paths:
  SRC _delta_log: True C:\engine_module_pipeline\delta\engine_module_delta\_delta_log
  TARGET (Delta): C:\engine_module_pipeline\delta\engine_module_infer_ready
  CHECKPOINT: C:\engine_module_pipeline\data\checkpoints\infer_ready_shared_checkpoint.json
  LOG_STATE: C:\engine_module_pipeline\data\checkpoints\infer_ready_log_state.json
  DLQ: C:\engine_module_pipeline\data\dlq\infer_ready


In [17]:

import json
import pyarrow as pa
from deltalake import write_deltalake

FEATURES_JSON = ARTIFACTS / "features.json"
MAP_MERGED    = CONFIG / "mapping_merged.json"

if not FEATURES_JSON.exists():
    raise FileNotFoundError(f"Missing {FEATURES_JSON}")
if not MAP_MERGED.exists():
    raise FileNotFoundError(f"Missing {MAP_MERGED}")


features = json.loads(FEATURES_JSON.read_text(encoding="utf-8"))
if isinstance(features, dict) and "features" in features:
    features = features["features"]
assert isinstance(features, list) and len(features) == 25, "features.json must list exactly 25 canonical features"


mapping = json.loads(MAP_MERGED.read_text(encoding="utf-8"))
canonical_to_raw = {f: mapping[f]["raw_key"] for f in features}

def infer_ready_arrow_schema():
    """Arrow schema for infer-ready: timestamp(us), date=date32, feature float64."""
    fields = [
        pa.field("row_hash",    pa.string(),           nullable=False),
        pa.field("timestamp",   pa.timestamp("us"),    nullable=True),
        pa.field("source_id",   pa.string(),           nullable=True),
        pa.field("kafka_key",   pa.string(),           nullable=True),
        pa.field("offset",      pa.int64(),            nullable=True),
        pa.field("source_file", pa.string(),           nullable=True),
        pa.field("date",        pa.date32(),           nullable=False),  # true DATE (NOT NULL)
    ]
    for f in features:
        fields.append(pa.field(f, pa.float64(), nullable=True))
    return pa.schema(fields)

SCHEMA = infer_ready_arrow_schema()

def ensure_infer_ready_delta_exists():
    if (INFER_READY / "_delta_log").exists():
        return
    empty_tbl = pa.Table.from_arrays([pa.array([], type=field.type) for field in SCHEMA], schema=SCHEMA)
    write_deltalake(
        str(INFER_READY),
        empty_tbl,
        mode="overwrite",
        partition_by=["date"],
    )
    print("Created empty Delta table at:", INFER_READY)

ensure_infer_ready_delta_exists()
print("Delta table ready at:", INFER_READY)


Created empty Delta table at: C:\engine_module_pipeline\delta\engine_module_infer_ready
Delta table ready at: C:\engine_module_pipeline\delta\engine_module_infer_ready


In [18]:
import os, time, json
from deltalake import DeltaTable

def read_checkpoint():
    if not CHECKPOINT_PATH.exists():
        return {"processed_files": [], "last_updated": None}
    try:
        return json.loads(CHECKPOINT_PATH.read_text(encoding="utf-8"))
    except Exception:
        return {"processed_files": [], "last_updated": None}

def write_checkpoint_atomic(data: dict):
    tmp = CHECKPOINT_PATH.with_suffix(".tmp")
    tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
    os.replace(str(tmp), str(CHECKPOINT_PATH))

def read_log_state():
    if not LOG_STATE_PATH.exists():
        return {"last_log_file": None}
    try:
        return json.loads(LOG_STATE_PATH.read_text(encoding="utf-8"))
    except Exception:
        return {"last_log_file": None}

def write_log_state(state: dict):
    tmp = LOG_STATE_PATH.with_suffix(".tmp")
    tmp.write_text(json.dumps(state, indent=2), encoding="utf-8")
    os.replace(str(tmp), str(LOG_STATE_PATH))

def acquire_lock(timeout: int = 30) -> bool:
    start = time.time()
    while True:
        try:
            fd = os.open(str(LOCK_PATH), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            os.write(fd, str(os.getpid()).encode("utf-8"))
            os.close(fd)
            return True
        except FileExistsError:
            try:
                # stale lock cleanup (older than 1h)
                if time.time() - LOCK_PATH.stat().st_mtime > 3600:
                    LOCK_PATH.unlink(); continue
            except Exception:
                pass
            if time.time() - start > timeout:
                return False
            time.sleep(0.5)

def refresh_lock_heartbeat():
    try:
        if LOCK_PATH.exists():
            os.utime(LOCK_PATH, None)
    except Exception:
        pass

def release_lock():
    try:
        if LOCK_PATH.exists():
            LOCK_PATH.unlink()
    except Exception:
        pass

def dlq_write(rows: list, prefix: str):
    if not rows:
        return
    DLQ_DIR.mkdir(parents=True, exist_ok=True)
    ts = time.strftime("%Y%m%d_%H%M%S")
    fn = DLQ_DIR / f"{prefix}_{ts}.jsonl"
    with fn.open("w", encoding="utf-8") as fh:
        for r in rows:
            fh.write(json.dumps(r) + "\n")
    print("DLQ written:", fn.name, "count:", len(rows))

def existing_row_hashes_from_delta(limit=None) -> set:
    """Read distinct row_hash from Delta table (columns=['row_hash']). For scale, consider a key-index later."""
    if not (INFER_READY / "_delta_log").exists():
        return set()
    dt = DeltaTable(str(INFER_READY))
    tbl = dt.to_pyarrow_table(columns=["row_hash"])
    vals = tbl.column("row_hash").to_pylist()
    if limit is not None:
        vals = vals[:limit]
    return set(vals)


In [19]:
LOG_DIR = DELTA_SRC / "_delta_log"
if not LOG_DIR.exists():
    raise FileNotFoundError(f"Missing _delta_log at {LOG_DIR}")

def _list_json_logs():
    return sorted([p for p in LOG_DIR.iterdir() if p.is_file() and p.suffix == ".json" and not p.name.startswith(".")],
                  key=lambda p: p.name)

def discover_add_file_paths_incremental(last_log_file: str | None):
    logs = _list_json_logs()
    if last_log_file:
        logs = [p for p in logs if p.name > last_log_file]
    add_paths = []
    latest = last_log_file
    for jf in logs:
        latest = jf.name
        for line in jf.read_text(encoding="utf-8").splitlines():
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if isinstance(obj, dict) and "add" in obj and obj["add"].get("path"):
                add_paths.append(DELTA_SRC / obj["add"]["path"])
    # de-dupe keep order
    seen=set(); out=[]
    for p in add_paths:
        if p not in seen and p.exists():
            seen.add(p); out.append(p)
    return latest, out

def discover_add_file_paths_full():
    # Backfill convenience: scan the entire _delta_log
    return discover_add_file_paths_incremental(last_log_file=None)[1]

def _to_float_or_none(v):
    if v is None or v == "":
        return None
    try:
        return float(v)
    except Exception:
        try:
            return float(str(v).replace(",", ""))
        except Exception:
            return None

def to_canonical_rows(parquet_file: Path) -> tuple[list, list]:
    """
    Parse a source parquet into (ok_rows, dlq_rows).
    Expect columns: row_hash (string), timestamp, payload (JSON/bytes), kafka_key, offset, ...
    """
    ok_rows, dlq_rows = [], []
    try:
        pf = pq.ParquetFile(str(parquet_file))
    except Exception as e:
        dlq_rows.append({"source_file": str(parquet_file), "reason": f"parquet_open_failed:{e!s}"})
        return ok_rows, dlq_rows

    for rg in range(pf.num_row_groups):
        table = pf.read_row_group(rg)
        cols = {name: table.column(name).to_pylist() for name in table.column_names}
        n = len(next(iter(cols.values()))) if cols else 0

        for i in range(n):
            row_hash = (cols.get("row_hash") or [None])[i]
            ts_raw   = (cols.get("timestamp") or [None])[i]
            kafka_key = (cols.get("kafka_key") or [None])[i]
            offset    = (cols.get("offset") or [None])[i]
            payload   = (cols.get("payload") or [None])[i]

            if row_hash is None:
                dlq_rows.append({"reason":"missing_row_hash", "source_file": str(parquet_file)}); continue

            # Parse payload to dict
            parsed = {}
            if payload is not None:
                try:
                    parsed = json.loads(payload) if isinstance(payload, str) else json.loads(payload.decode("utf-8", errors="ignore"))
                except Exception:
                    dlq_rows.append({"row_hash": str(row_hash), "reason":"payload_parse_failed", "source_file": str(parquet_file)})
                    continue
            data = parsed.get("data", parsed) if isinstance(parsed, dict) else {}
            meta = parsed.get("meta", {}) if isinstance(parsed, dict) else {}

            # Build canonical row (raw types; finalize later)
            out = {
                "row_hash":    str(row_hash),
                "timestamp":   pd.to_datetime(ts_raw, errors="coerce", utc=True),
                "source_id":   meta.get("source_id") or kafka_key,
                "kafka_key":   kafka_key if kafka_key is not None else None,
                "offset":      None,
                "source_file": str(parquet_file),
            }
            # robust offset parse (allow negatives / strings)
            try:
                out["offset"] = int(offset) if offset is not None else None
            except Exception:
                out["offset"] = None

            missing = 0
            for f in features:
                raw_key = canonical_to_raw.get(f)
                v = data.get(raw_key) if raw_key is not None else None
                fv = _to_float_or_none(v)
                if fv is None: missing += 1
                out[f] = fv

            # Quality gate (feature completeness first)
            if len(features) and (missing / len(features)) > 0.30:
                dlq_rows.append({"row_hash": out["row_hash"], "reason":"too_many_missing",
                                 "null_frac": round(missing/len(features), 3),
                                 "source_file": str(parquet_file)})
            elif out["timestamp"] is None:
                dlq_rows.append({"row_hash": out["row_hash"], "reason":"null_or_bad_timestamp",
                                 "source_file": str(parquet_file)})
            else:
                ok_rows.append(out)

    # finalize types for OK rows + intra-file dedupe by row_hash
    if ok_rows:
        df = pd.DataFrame(ok_rows)
        before = len(df)
        df = df.drop_duplicates(subset=["row_hash"], keep="first")
        # timestamps -> microseconds (naive UTC)
        df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce").dt.tz_convert(None).astype("datetime64[us]")
        # date partition from timestamp
        df["date"] = pd.to_datetime(df["timestamp"], errors="coerce").dt.date.astype("object")  # Arrow will coerce to date32
        # drop any rows whose date could not be derived (should be none after timestamp check)
        df = df.dropna(subset=["date"])
        ok_rows = df.to_dict(orient="records")

    return ok_rows, dlq_rows


In [20]:
def delta_append_rows(rows: list) -> int:
    """Append rows to Delta using the Arrow schema (enforces date32/timestamp(us)/float64)."""
    if not rows:
        return 0
    df = pd.DataFrame(rows)

    # Guarantee all feature columns exist
    for f in features:
        if f not in df.columns:
            df[f] = None

    # Hard guard: timestamp / date must be non-null
    df = df.dropna(subset=["timestamp"])
    df = df.dropna(subset=["date"])
    if df.empty:
        return 0

    # Column order strictly follows SCHEMA
    df = df[[f.name for f in SCHEMA]]

    # Convert to Arrow using our schema (forces date32 & timestamp(us))
    table = pa.Table.from_pandas(df, schema=SCHEMA, preserve_index=False)
    write_deltalake(
        str(INFER_READY),
        table,
        mode="append",
        partition_by=["date"]
    )
    return len(df)

# ===== RUN BACKFILL (idempotent, lock-protected) =====
all_add_files = discover_add_file_paths_full()
ck = read_checkpoint()
already = set(ck.get("processed_files", []))
to_process = [p for p in all_add_files if p.name not in already]
print("Backfill candidates:", len(to_process))

if to_process:
    if not acquire_lock(timeout=30):
        raise RuntimeError("Could not acquire lock; another writer is active.")
    try:
        existing_keys = existing_row_hashes_from_delta()
        print("Existing infer-ready row_hash keys:", len(existing_keys))
        newly = []
        total_appended = 0
        for idx, pf in enumerate(to_process, 1):
            ok, dlq = to_canonical_rows(pf)
            # de-dupe vs Delta table (set membership)
            ok = [r for r in ok if r["row_hash"] not in existing_keys]
            appended = delta_append_rows(ok)
            total_appended += appended
            if appended:
                existing_keys.update([r["row_hash"] for r in ok])
            if dlq:
                dlq_write(dlq, f"dlq_backfill_{pf.stem}")
            newly.append(pf.name)
            if idx % 20 == 0:
                refresh_lock_heartbeat()
        # checkpoint
        new_ck = {"processed_files": sorted(already.union(newly)), "last_updated": int(time.time())}
        write_checkpoint_atomic(new_ck)
        print(f"Backfill done. Appended {total_appended} rows to Delta. Checkpoint updated with {len(newly)} files.")
    finally:
        release_lock()
else:
    print("Nothing to backfill. Skipping.")


Backfill candidates: 4
Existing infer-ready row_hash keys: 0
Backfill done. Appended 2000 rows to Delta. Checkpoint updated with 4 files.


In [None]:
POLL_SECS = 8
REFRESH_KEYS_EVERY = 60

print("Starting watcher… CTRL+C to stop.")
if not acquire_lock(timeout=30):
    raise RuntimeError("Could not acquire lock; another writer is active.")
try:
    ck = read_checkpoint()
    processed_files = set(ck.get("processed_files", []))
    existing_keys = existing_row_hashes_from_delta()
    log_state = read_log_state()
    last_log = log_state.get("last_log_file")

    print("Watcher bootstrap:", len(processed_files), "processed files;", len(existing_keys), "existing keys.")
    last_refresh = time.time()

    while True:
        last_log, candidates = discover_add_file_paths_incremental(last_log)
        candidates = [p for p in candidates if p.name not in processed_files and p.exists()]
        if candidates:
            total_app = 0
            newly = []
            for pf in candidates:
                ok, dlq = to_canonical_rows(pf)
                ok = [r for r in ok if r["row_hash"] not in existing_keys]
                appended = delta_append_rows(ok)
                total_app += appended
                if appended:
                    existing_keys.update([r["row_hash"] for r in ok])
                if dlq:
                    dlq_write(dlq, f"dlq_watch_{pf.stem}")
                newly.append(pf.name)
            processed_files.update(newly)
            write_checkpoint_atomic({"processed_files": sorted(processed_files), "last_updated": int(time.time())})
            write_log_state({"last_log_file": last_log})
            print(f"Watcher: processed {len(newly)} files; appended {total_app} rows.")
        if time.time() - last_refresh > REFRESH_KEYS_EVERY:
            existing_keys = existing_row_hashes_from_delta()
            last_refresh = time.time()
        refresh_lock_heartbeat()
        time.sleep(POLL_SECS)
except KeyboardInterrupt:
    print("Watcher interrupted by user.")
finally:
    release_lock()
    print("Watcher stopped; lock released.")


In [23]:
from deltalake import DeltaTable
import pyarrow as pa

if not (INFER_READY / "_delta_log").exists():
    print("Infer-ready Delta not initialized.")
else:
    dt = DeltaTable(str(INFER_READY))

    # Row count via light projection (prefer a narrow column)
    try:
        t_min = dt.to_pyarrow_table(columns=["row_hash"])
        total_rows = t_min.num_rows
    except Exception:
        t_all = dt.to_pyarrow_table()
        total_rows = t_all.num_rows

    # Get a real pyarrow.Schema
    try:
        pa_schema = dt.schema().to_pyarrow()   # delta-rs Schema -> pyarrow.Schema
    except Exception:
        pa_schema = dt.to_pyarrow_table().schema  # fallback

    # Safe field lookups
    names = set(pa_schema.names)
    date_field = pa_schema.field("date") if "date" in names else None
    ts_field   = pa_schema.field("timestamp") if "timestamp" in names else None

    print("Infer-ready row count:", f"{total_rows:,}")
    print("date field:", date_field.type if date_field else "MISSING")
    print("timestamp field:", ts_field.type if ts_field else "MISSING")


Infer-ready row count: 2,000
date field: date32[day]
timestamp field: timestamp[us]


In [24]:
import os, json, math, time
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
from deltalake import DeltaTable

ROOT = Path(r"C:\engine_module_pipeline")
INFER_READY = ROOT / r"delta\engine_module_infer_ready"
ARTIFACTS   = ROOT / "engine_module_artifacts"
FEATURES_JSON = ARTIFACTS / "features.json"

assert (INFER_READY / "_delta_log").exists(), f"Infer-ready Delta missing at {INFER_READY}"
assert FEATURES_JSON.exists(), f"Missing {FEATURES_JSON}"

features = json.loads(FEATURES_JSON.read_text(encoding="utf-8"))
if isinstance(features, dict) and "features" in features:
    features = features["features"]
assert isinstance(features, list) and len(features) == 25, "features.json must list exactly 25 canonical features"

SYSTEM_COLS = ["row_hash","timestamp","source_id","kafka_key","offset","source_file","date"]
EXPECTED_COLS = SYSTEM_COLS + features

print("✓ Loaded features (25). First 5:", features[:5])
print("✓ INFER_READY:", INFER_READY)


✓ Loaded features (25). First 5: ['Air_Fuel_Ratio_Commanded_:1', 'Air_Fuel_Ratio_Measured_:1', 'Catalyst_Temperature__Bank_1_Sensor_1', 'Catalyst_Temperature__Bank_1_Sensor_2', 'Engine_kW__At_the_wheels_kW']
✓ INFER_READY: C:\engine_module_pipeline\delta\engine_module_infer_ready


In [25]:
dt = DeltaTable(str(INFER_READY))

# Row count using a narrow projection
try:
    t_min = dt.to_pyarrow_table(columns=["row_hash"])
    total_rows = t_min.num_rows
except Exception:
    total_rows = dt.to_pyarrow_table().num_rows

# Schema -> pyarrow.Schema
try:
    pa_schema = dt.schema().to_pyarrow()
except Exception:
    pa_schema = dt.to_pyarrow_table().schema

names = pa_schema.names
print("\n=== SHAPE ===")
print("rows:", f"{total_rows:,}")
print("cols:", len(names))

print("\n=== SCHEMA (pyarrow) ===")
for f in pa_schema:
    print(f"- {f.name}: {f.type}{'' if f.nullable else ' not null'}")

# Partitions on disk
parts = sorted([p.name for p in INFER_READY.glob("date=*") if p.is_dir()])
print("\n=== PARTITIONS (first 20) ===")
print(parts[:20])

# Time coverage
ts_tbl = dt.to_pyarrow_table(columns=["timestamp","date"])
ts_df = ts_tbl.to_pandas()
if ts_df.empty:
    print("\n=== TIME COVERAGE ===\n(table empty)")
else:
    tmin = pd.to_datetime(ts_df["timestamp"]).min()
    tmax = pd.to_datetime(ts_df["timestamp"]).max()
    dmin = pd.to_datetime(ts_df["date"]).min()
    dmax = pd.to_datetime(ts_df["date"]).max()
    print("\n=== TIME COVERAGE ===")
    print("timestamp min:", tmin)
    print("timestamp max:", tmax)
    print("date      min:", None if pd.isna(dmin) else dmin.date())
    print("date      max:", None if pd.isna(dmax) else dmax.date())



=== SHAPE ===
rows: 2,000
cols: 32

=== SCHEMA (pyarrow) ===
- row_hash: string not null
- timestamp: timestamp[us]
- source_id: string
- kafka_key: string
- offset: int64
- source_file: string
- date: date32[day] not null
- Air_Fuel_Ratio_Commanded_:1: double
- Air_Fuel_Ratio_Measured_:1: double
- Catalyst_Temperature__Bank_1_Sensor_1: double
- Catalyst_Temperature__Bank_1_Sensor_2: double
- Engine_kW__At_the_wheels_kW: double
- Engine_Load_Absolute_pct: double
- Engine_Oil_Temperature: double
- Engine_RPM_rpm: double
- Fuel_flow_rate_hour_l_hr: double
- Fuel_Trim_Bank_1_Long_Term_pct: double
- Fuel_Trim_Bank_1_Short_Term_pct: double
- Mass_Air_Flow_Rate_g_s: double
- O2_Sensor1_Wide_Range_Current_mA: double
- O2_Bank_1_Sensor_2_Voltage_V: double
- Run_time_since_engine_start_s: double
- Timing_Advance: double
- Turbo_Boost_&_Vacuum_Gauge_psi: double
- Voltage__Control_Module_V: double
- Volumetric_Efficiency__Calculated_pct: double
- ECU_7EA:_Engine_Coolant_Temperature: double
- ECU

In [26]:
actual_cols = set(names)
expected_cols = set(EXPECTED_COLS)

missing = sorted(list(expected_cols - actual_cols))
extras  = sorted(list(actual_cols - expected_cols))

print("\n=== COLUMN SET CHECK ===")
print("Missing columns:", missing)
print("Unexpected columns:", extras)

# Order check (not required by Delta itself, but good for humans)
ideal_order = EXPECTED_COLS
order_ok = list(names) == ideal_order
print("\n=== COLUMN ORDER CHECK ===")
print("Matches ideal order from contract?:", order_ok)

# Spot-print first 10 column names
print("\nFirst 10 columns:", list(names)[:10])



=== COLUMN SET CHECK ===
Missing columns: []
Unexpected columns: []

=== COLUMN ORDER CHECK ===
Matches ideal order from contract?: True

First 10 columns: ['row_hash', 'timestamp', 'source_id', 'kafka_key', 'offset', 'source_file', 'date', 'Air_Fuel_Ratio_Commanded_:1', 'Air_Fuel_Ratio_Measured_:1', 'Catalyst_Temperature__Bank_1_Sensor_1']


In [27]:
tbl_all = dt.to_pyarrow_table()
pdf = tbl_all.to_pandas()  # fine at 2k rows; revisit if table grows huge

n = len(pdf)
stats = []
for c in pdf.columns:
    nulls = int(pd.isna(pdf[c]).sum())
    stats.append((c, n, n - nulls, nulls, (nulls / n if n > 0 else 0.0)))

stats.sort(key=lambda x: x[4])  # by null_frac ASC (most complete first)

print("\n=== COMPLETENESS (top 15 by best completeness) ===")
for name, tot, non_null, nulls, frac in stats[:15]:
    print(f"{name:32s} total={tot:7d} non_null={non_null:7d} nulls={nulls:7d} null_frac={frac:5.3f}")

print("\n=== COMPLETENESS (worst 10 by null_frac) ===")
for name, tot, non_null, nulls, frac in stats[-10:]:
    print(f"{name:32s} total={tot:7d} non_null={non_null:7d} nulls={nulls:7d} null_frac={frac:5.3f}")



=== COMPLETENESS (top 15 by best completeness) ===
row_hash                         total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
timestamp                        total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
source_id                        total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
kafka_key                        total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
offset                           total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
source_file                      total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
date                             total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
Air_Fuel_Ratio_Commanded_:1      total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
Air_Fuel_Ratio_Measured_:1       total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
Catalyst_Temperature__Bank_1_Sensor_1 total=   2000 non_null=   2000 nulls=      0 null_frac=0.000
Cat

In [28]:
problems = []

dups = pdf["row_hash"].value_counts()
dups = dups[dups > 1]
print("\n=== ROW HASH UNIQUENESS ===")
if dups.empty:
    print("All row_hash values are unique.")
else:
    print("Duplicate row_hash count:", len(dups))
    print("Sample duplicates:", list(dups.head(5).items()))
    problems.append("DUP_ROW_HASH")

ts = pd.to_datetime(pdf["timestamp"], errors="coerce", utc=True)
date_from_ts = ts.dt.tz_convert(None).dt.date.astype("object")
mismatch = (date_from_ts != pd.to_datetime(pdf["date"], errors="coerce").dt.date)
mismatch_cnt = int(mismatch.sum())
print("\n=== DATE VS TIMESTAMP FLOOR ===")
print("Mismatched rows:", mismatch_cnt)
if mismatch_cnt > 0:
    print(pdf.loc[mismatch, ["row_hash","timestamp","date"]].head(5))
    problems.append("DATE_TS_MISMATCH")

print("\n=== SYSTEM COL TYPES (pandas dtypes) ===")
print(pdf[SYSTEM_COLS].dtypes)

non_int_offsets = pdf["offset"].dropna().map(lambda x: isinstance(x, (int, np.integer))).value_counts()
print("\n=== OFFSET TYPE CHECK ===")
print("Is int-like distribution:", dict(non_int_offsets))
if not non_int_offsets.get(True, 0) == len(pdf["offset"].dropna()):
    problems.append("OFFSET_NON_INT")

print("\n=== PROBLEMS SUMMARY ===")
print("Problems:", problems if problems else "None")



=== ROW HASH UNIQUENESS ===
All row_hash values are unique.

=== DATE VS TIMESTAMP FLOOR ===
Mismatched rows: 0

=== SYSTEM COL TYPES (pandas dtypes) ===
row_hash               object
timestamp      datetime64[us]
source_id              object
kafka_key              object
offset                  int64
source_file            object
date                   object
dtype: object

=== OFFSET TYPE CHECK ===
Is int-like distribution: {True: 2000}

=== PROBLEMS SUMMARY ===
Problems: None


In [29]:
feat_stats = []
nonfinite_any = False
for f in features:
    col = pd.to_numeric(pdf[f], errors="coerce")
    nn = int(col.notna().sum())
    nz = int((col == 0).sum(skipna=True)) if nn else 0
    vals = col.dropna()
    if nn:
        mn = float(vals.min())
        mx = float(vals.max())
        mean = float(vals.mean())
        std = float(vals.std(ddof=1)) if len(vals) > 1 else 0.0
        p05 = float(vals.quantile(0.05)) if len(vals) > 1 else mn
        p95 = float(vals.quantile(0.95)) if len(vals) > 1 else mx
    else:
        mn = mx = mean = std = p05 = p95 = float("nan")
    nonfinite = int(np.isfinite(vals).sum()) != len(vals)
    nonfinite_any = nonfinite_any or nonfinite
    nulls = len(pdf) - nn
    feat_stats.append((f, nn, nulls, nulls/len(pdf) if len(pdf) else 0.0, mn, mx, mean, std, p05, p95, nonfinite))

feat_stats.sort(key=lambda x: x[2], reverse=True)  # sort by null count desc (worst first)

print("\n=== FEATURES: WORST 10 BY NULL COUNT ===")
for f, nn, nulls, frac, mn, mx, mean, std, p05, p95, nf in feat_stats[:10]:
    print(f"{f:35s} nn={nn:5d} nulls={nulls:5d} null_frac={frac:5.3f} "
          f"min={mn:.6g} p05={p05:.6g} mean={mean:.6g} p95={p95:.6g} max={mx:.6g} std={std:.6g} nonfinite={nf}")

print("\n=== FEATURES: BEST 10 BY COMPLETENESS ===")
best = sorted(feat_stats, key=lambda x: x[2])[:10]
for f, nn, nulls, frac, mn, mx, mean, std, p05, p95, nf in best:
    print(f"{f:35s} nn={nn:5d} nulls={nulls:5d} null_frac={frac:5.3f} "
          f"min={mn:.6g} p05={p05:.6g} mean={mean:.6g} p95={p95:.6g} max={mx:.6g} std={std:.6g} nonfinite={nf}")

p



=== FEATURES: WORST 10 BY NULL COUNT ===
Air_Fuel_Ratio_Commanded_:1         nn= 2000 nulls=    0 null_frac=0.000 min=9.64949 p05=13.316 mean=13.7007 p95=14.0882 max=14.8418 std=0.755145 nonfinite=False
Air_Fuel_Ratio_Measured_:1          nn= 2000 nulls=    0 null_frac=0.000 min=11.5725 p05=13.1274 mean=15.0592 p95=17.3593 max=17.3593 std=1.44844 nonfinite=False
Catalyst_Temperature__Bank_1_Sensor_1 nn= 2000 nulls=    0 null_frac=0.000 min=461.8 p05=474.9 mean=552.685 p95=607.6 max=633.9 std=39.0235 nonfinite=False
Catalyst_Temperature__Bank_1_Sensor_2 nn= 2000 nulls=    0 null_frac=0.000 min=282.3 p05=301.1 mean=424.801 p95=503.7 max=531 std=61.7923 nonfinite=False
Engine_kW__At_the_wheels_kW         nn= 2000 nulls=    0 null_frac=0.000 min=0.255514 p05=0.668211 mean=0.667337 p95=0.668211 max=1.32983 std=0.0631608 nonfinite=False
Engine_Load_Absolute_pct            nn= 2000 nulls=    0 null_frac=0.000 min=9.41177 p05=16.0784 mean=51.6355 p95=78.0392 max=101.176 std=16.5171 nonfinite=

WindowsPath('C:/engine_module_pipeline/delta/engine_module_infer_ready/date=2025-09-16')

In [30]:
print("\n=== SPARK BACKTICK HINT (columns that need quoting) ===")
needs_backticks = [c for c in EXPECTED_COLS if not c.isidentifier() or any(ch in c for ch in [' ',':','&','-','/','(',')','%'])]
print("Count:", len(needs_backticks))
print("Examples:", needs_backticks[:10])

print("\n=== HEAD(5) — minimal columns for visual sanity ===")
cols_show = ["row_hash","timestamp","source_id","kafka_key","offset","source_file","date"] + features[:3]
print(pdf[cols_show].head(5))



=== SPARK BACKTICK HINT (columns that need quoting) ===
Count: 9
Examples: ['Air_Fuel_Ratio_Commanded_:1', 'Air_Fuel_Ratio_Measured_:1', 'Turbo_Boost_&_Vacuum_Gauge_psi', 'ECU_7EA:_Engine_Coolant_Temperature', 'ECU_7EA:_Intake_Air_Temperature', 'ECU_7EB:_Ambient_air_temp', 'ECU_7EB:_Engine_Load_pct', 'ECU_7EB:_Engine_RPM_rpm', 'ECU_7EB:_Speed__OBD_km_h']

=== HEAD(5) — minimal columns for visual sanity ===
                                            row_hash               timestamp source_id kafka_key  offset                                        source_file  \
0  d3d180f31455aa45ad65af08914f7e9038861eb75c800a... 2025-09-16 07:00:15.797    sim001    sim001    1500  C:\engine_module_pipeline\delta\engine_module_...   
1  d0b0fa020237ffd78f6120c946eabd80d2763cedc815ad... 2025-09-16 07:00:15.797    sim001    sim001    1501  C:\engine_module_pipeline\delta\engine_module_...   
2  5e2f4fb2d4a5b8b56f1caa1ee20a9427a6f078364ba8be... 2025-09-16 07:00:15.797    sim001    sim001    1502  C:\eng

In [31]:

def _is_list_of_dicts(v):
    if not isinstance(v, list): return False
    for d in v:
        if not isinstance(d, dict): return False
        if "feature" not in d or "contribution" not in d: return False
    return True

def _is_map_float(d):
    if not isinstance(d, dict): return False
    for k, v in d.items():
        try:
            _ = float(v) if v is not None else 0.0
        except Exception:
            return False
    return True

sample = pdf.head(20).copy()
df_out_shape = sample.copy()
df_out_shape["recon_error_dense"] = 0.0
df_out_shape["dense_per_feature_error"] = [{f: 0.0 for f in features} for _ in range(len(sample))]
df_out_shape["recon_error_lstm"] = None
df_out_shape["lstm_window_id"] = None
df_out_shape["isolation_score"] = 0.0
df_out_shape["kde_logp"] = 0.0
df_out_shape["gmm_logp"] = 0.0
df_out_shape["combiner_score"] = None
df_out_shape["composite_score"] = 0.5
df_out_shape["anomaly_label"] = "suspicious"
df_out_shape["anomaly_severity"] = 1
df_out_shape["model_versions"] = {"dense":"ts","lstm":"sd","isof":"joblib"}
df_out_shape["inference_run_id"] = "run-preflight"
df_out_shape["inference_ts"] = pd.Timestamp.utcnow()
df_out_shape["processing_latency_ms"] = None
df_out_shape["explain_top_k"] = [[{"feature": features[0], "contribution": 0.1}]] * len(sample)
df_out_shape["raw_model_outputs"] = {}
df_out_shape["notes"] = None
df_out_shape["date"] = pd.to_datetime(df_out_shape["timestamp"]).dt.date.astype(str)

violations = []
for idx, r in df_out_shape.head(5).iterrows():
    if not _is_list_of_dicts(r["explain_top_k"]): violations.append(("explain_top_k", idx))
    if not _is_map_float(r["dense_per_feature_error"]): violations.append(("dense_per_feature_error", idx))
    if not isinstance(r["model_versions"], dict): violations.append(("model_versions", idx))
    if not (isinstance(r["anomaly_severity"], (int, np.integer))): violations.append(("anomaly_severity", idx))
    if r["date"] is None or not isinstance(r["date"], str): violations.append(("date", idx))

print("\n=== PREFLIGHT STRUCTURAL CHECK (df_out-shaped) ===")
print("Violations:", violations if violations else "None")
print("Sample columns present:", list(df_out_shape.columns)[:12], "…")



=== PREFLIGHT STRUCTURAL CHECK (df_out-shaped) ===
Violations: [('model_versions', 0), ('model_versions', 1), ('model_versions', 2), ('model_versions', 3), ('model_versions', 4)]
Sample columns present: ['row_hash', 'timestamp', 'source_id', 'kafka_key', 'offset', 'source_file', 'date', 'Air_Fuel_Ratio_Commanded_:1', 'Air_Fuel_Ratio_Measured_:1', 'Catalyst_Temperature__Bank_1_Sensor_1', 'Catalyst_Temperature__Bank_1_Sensor_2', 'Engine_kW__At_the_wheels_kW'] …


In [32]:

issues = []

if len(missing) > 0: issues.append(f"Missing columns: {missing}")
if len(extras)  > 0: issues.append(f"Unexpected columns: {extras}")

if "DUP_ROW_HASH" in locals().get("problems", []): issues.append("Duplicate row_hash found")
if "DATE_TS_MISMATCH" in locals().get("problems", []): issues.append("date != floor(timestamp) mismatch present")
if "OFFSET_NON_INT" in locals().get("problems", []): issues.append("offset contains non-integers")

high_null_feats = [f for f, nn, nulls, frac, *_ in feat_stats if frac > 0.30]
if high_null_feats: issues.append(f"Features with >30% nulls: {high_null_feats[:5]}{'…' if len(high_null_feats)>5 else ''}")

print("\n=== GO / NO-GO FOR INFERENCE ===")
if not issues:
    print("GO ✅  — Inference notebook can read this table safely.")
else:
    print("NO-GO  — Please fix before inference.")
    for it in issues:
        print(" -", it)



=== GO / NO-GO FOR INFERENCE ===
GO ✅  — Inference notebook can read this table safely.
