In [28]:
# T-1 — Paths, imports, device, canonical feature contract

import os, json, math, time, uuid, warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KernelDensity
from sklearn.mixture import GaussianMixture

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----- Paths -----
# The project root defaults to the original Windows location, but can be
# overridden with the ENGINE_MODULE_ROOT environment variable so the notebook
# can run on another machine without editing code.
ROOT         = Path(os.environ.get("ENGINE_MODULE_ROOT", r"C:\engine_module_pipeline"))
INFER_STAGE  = ROOT / "infer_stage"
ARTIFACTS    = INFER_STAGE / "artifacts"
# Join segments with `/` instead of embedding backslashes in one string:
# a backslash is only a path separator on Windows, so r"data\csv\..." would
# become a single oddly named file on any other OS. On Windows the result is
# identical to the previous value.
DATA_CSV     = ROOT / "data" / "csv" / "engine_module.csv"

# Every artifact written by this notebook lands in ARTIFACTS; create it up front.
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# ----- Device -----
# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ----- Canonical feature list (order-sensitive, used by ALL models) -----
# Path segments are joined with `/` so the location resolves on any OS
# (a backslash inside a single segment is only a separator on Windows).
FEATURES_JSON_CANON = ROOT / "engine_module_artifacts" / "features.json"
if FEATURES_JSON_CANON.exists():
    features = json.loads(FEATURES_JSON_CANON.read_text(encoding="utf-8"))
    # Accept either a bare JSON list or an object wrapping it under "features".
    if isinstance(features, dict) and "features" in features:
        features = features["features"]
else:
    # Fallback: hardcode (must match your canonical order)
    features = [
        "Air_Fuel_Ratio_Commanded_:1","Air_Fuel_Ratio_Measured_:1",
        "Catalyst_Temperature__Bank_1_Sensor_1","Catalyst_Temperature__Bank_1_Sensor_2",
        "Engine_kW__At_the_wheels_kW","Engine_Load_Absolute_pct","Engine_Oil_Temperature","Engine_RPM_rpm",
        "Fuel_flow_rate_hour_l_hr","Fuel_Trim_Bank_1_Long_Term_pct","Fuel_Trim_Bank_1_Short_Term_pct",
        "Mass_Air_Flow_Rate_g_s","O2_Sensor1_Wide_Range_Current_mA","O2_Bank_1_Sensor_2_Voltage_V",
        "Run_time_since_engine_start_s","Timing_Advance","Turbo_Boost_&_Vacuum_Gauge_psi",
        "Voltage__Control_Module_V","Volumetric_Efficiency__Calculated_pct",
        "ECU_7EA:_Engine_Coolant_Temperature","ECU_7EA:_Intake_Air_Temperature",
        "ECU_7EB:_Ambient_air_temp","ECU_7EB:_Engine_Load_pct","ECU_7EB:_Engine_RPM_rpm","ECU_7EB:_Speed__OBD_km_h",
    ]
# Fail early with a clear message if the JSON had an unexpected shape
# (e.g. a dict without a "features" key would otherwise slip through as a dict).
assert isinstance(features, list) and all(isinstance(f, str) for f in features), \
    "features must be a list of column-name strings"
assert len(features) == 25
assert len(set(features)) == len(features), "duplicate names in canonical feature list"

# ----- Raw CSV column names (with punctuation) -----
# Header strings exactly as they are expected to appear in the source CSV.
# The loading cell asserts that every one of them is present, selects them
# from the raw frame, and renames them via RAW_TO_CANON.
RAW_COLS = [
    "Air Fuel Ratio(Commanded)(:1)","Air Fuel Ratio(Measured)(:1)",
    "Catalyst Temperature (Bank 1 Sensor 1)","Catalyst Temperature (Bank 1 Sensor 2)",
    "Engine kW (At the wheels)(kW)","Engine Load(Absolute)(%)","Engine Oil Temperature","Engine RPM(rpm)",
    "Fuel flow rate/hour(l/hr)","Fuel Trim Bank 1 Long Term(%)","Fuel Trim Bank 1 Short Term(%)",
    "Mass Air Flow Rate(g/s)","O2 Sensor1 Wide Range Current(mA)","O2 Bank 1 Sensor 2 Voltage(V)",
    "Run time since engine start(s)","Timing Advance","Turbo Boost & Vacuum Gauge(psi)",
    "Voltage (Control Module)(V)","Volumetric Efficiency (Calculated)(%)",
    "ECU(7EA): Engine Coolant Temperature","ECU(7EA): Intake Air Temperature",
    "ECU(7EB): Ambient air temp","ECU(7EB): Engine Load(%)","ECU(7EB): Engine RPM(rpm)","ECU(7EB): Speed (OBD)(km/h)",
]
# Name of the timestamp column in the raw CSV (also recorded in the input contract).
TIMESTAMP_COL = "timestamp"

# ----- Deterministic raw->canonical mapping (1:1) -----
# Keys are raw CSV headers, values are the canonical feature names used by the models.
# This mapping is also embedded in the saved input contract and runtime manifest.
RAW_TO_CANON = {
    "Air Fuel Ratio(Commanded)(:1)": "Air_Fuel_Ratio_Commanded_:1",
    "Air Fuel Ratio(Measured)(:1)": "Air_Fuel_Ratio_Measured_:1",
    "Catalyst Temperature (Bank 1 Sensor 1)": "Catalyst_Temperature__Bank_1_Sensor_1",
    "Catalyst Temperature (Bank 1 Sensor 2)": "Catalyst_Temperature__Bank_1_Sensor_2",
    "Engine kW (At the wheels)(kW)": "Engine_kW__At_the_wheels_kW",
    "Engine Load(Absolute)(%)": "Engine_Load_Absolute_pct",
    "Engine Oil Temperature": "Engine_Oil_Temperature",
    "Engine RPM(rpm)": "Engine_RPM_rpm",
    "Fuel flow rate/hour(l/hr)": "Fuel_flow_rate_hour_l_hr",
    "Fuel Trim Bank 1 Long Term(%)": "Fuel_Trim_Bank_1_Long_Term_pct",
    "Fuel Trim Bank 1 Short Term(%)": "Fuel_Trim_Bank_1_Short_Term_pct",
    "Mass Air Flow Rate(g/s)": "Mass_Air_Flow_Rate_g_s",
    "O2 Sensor1 Wide Range Current(mA)": "O2_Sensor1_Wide_Range_Current_mA",
    "O2 Bank 1 Sensor 2 Voltage(V)": "O2_Bank_1_Sensor_2_Voltage_V",
    "Run time since engine start(s)": "Run_time_since_engine_start_s",
    "Timing Advance": "Timing_Advance",
    "Turbo Boost & Vacuum Gauge(psi)": "Turbo_Boost_&_Vacuum_Gauge_psi",
    "Voltage (Control Module)(V)": "Voltage__Control_Module_V",
    "Volumetric Efficiency (Calculated)(%)": "Volumetric_Efficiency__Calculated_pct",
    "ECU(7EA): Engine Coolant Temperature": "ECU_7EA:_Engine_Coolant_Temperature",
    "ECU(7EA): Intake Air Temperature": "ECU_7EA:_Intake_Air_Temperature",
    "ECU(7EB): Ambient air temp": "ECU_7EB:_Ambient_air_temp",
    "ECU(7EB): Engine Load(%)": "ECU_7EB:_Engine_Load_pct",
    "ECU(7EB): Engine RPM(rpm)": "ECU_7EB:_Engine_RPM_rpm",
    "ECU(7EB): Speed (OBD)(km/h)": "ECU_7EB:_Speed__OBD_km_h",
}
# Checks that the mapped names cover exactly the canonical feature set.
# This is a set comparison, so it does not verify order.
assert set(RAW_TO_CANON.values()) == set(features)

# ----- Windows-safe per-feature filename sanitizer (for KDE/GMM only) -----
def sanitize_for_filename(feat: str) -> str:
    """Return ``feat`` with every ":" swapped for the token "__COLON__".

    Used to build the per-feature KDE/GMM artifact filenames; all other
    characters are passed through unchanged.
    """
    colon_free_parts = feat.split(":")
    return "__COLON__".join(colon_free_parts)

# ----- Save the "single source of truth" model input contract -----
# Bundles the timestamp column name, the ordered feature list, the raw->canonical
# mapping and the filename rule into one JSON document written to ARTIFACTS.
MODEL_INPUT_CONTRACT = {
    "timestamp_column": TIMESTAMP_COL,
    "features_canonical_order": features,   # required order for ALL 5 models
    "raw_to_canonical_map": RAW_TO_CANON,   # CSV→canonical
    "per_feature_filename_rule": {"colon": "__COLON__"},  # used for KDE/GMM artifacts only
    "inference_expectation": {
        # Metadata columns followed by the 25 features, in canonical order.
        "infer_ready_columns": ["row_hash","timestamp","source_id","kafka_key","offset","source_file","date"] + features,
        "timestamp_tz": "UTC",
        "feature_dtype": "float64 (castable to float32 for model input)",
        "scaler": "RobustScaler(25th–75th) fitted on train only"
    }
}
# json.dumps uses its default ensure_ascii behaviour here, so non-ASCII characters
# in the strings above are written as \u escapes in the file.
(ARTIFACTS / "model_input_contract.json").write_text(json.dumps(MODEL_INPUT_CONTRACT, indent=2), encoding="utf-8")
print("Saved:", ARTIFACTS / "model_input_contract.json")


Device: cuda
Saved: C:\engine_module_pipeline\infer_stage\artifacts\model_input_contract.json


In [30]:
import torch
# Environment report: installed PyTorch build, whether CUDA is usable,
# and the name of CUDA device 0 ("None" when CUDA is unavailable).
print("PyTorch version:", torch.__version__)
print("CUDA available :", torch.cuda.is_available())
print("CUDA device    :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


PyTorch version: 2.5.1+cu121
CUDA available : True
CUDA device    : NVIDIA GeForce GTX 1650


In [31]:
# T-2 — Load CSV, rename to canonical, clean, split, scale

assert DATA_CSV.exists(), f"Missing data CSV: {DATA_CSV}"
df_raw = pd.read_csv(DATA_CSV)
assert TIMESTAMP_COL in df_raw.columns
assert set(RAW_COLS).issubset(df_raw.columns)

# Rename raw CSV headers to canonical feature names and parse timestamps as
# timezone-aware UTC (unparseable values become NaT).
df = df_raw[[TIMESTAMP_COL] + RAW_COLS].rename(columns={c: RAW_TO_CANON[c] for c in RAW_COLS})
df[TIMESTAMP_COL] = pd.to_datetime(df[TIMESTAMP_COL], errors="coerce", utc=True)

# Drop rows with >30% missing across the 25 features (same rule as infer-ready writer)
keep_mask = df[features].isna().mean(axis=1) <= 0.30
df = df.loc[keep_mask]

# Time sort (important for LSTM) — this MUST happen before the feature matrix
# is extracted. Previously `feat_df` was taken from the unsorted frame and only
# `df` was sorted afterwards, so X kept the CSV row order and the "time-based"
# split / sequence windows were not actually chronological.
# A stable sort keeps the original relative order of rows with equal timestamps;
# rows with NaT timestamps are placed last (pandas default).
df = df.sort_values(TIMESTAMP_COL, kind="mergesort").reset_index(drop=True)
feat_df = df[features].astype(float)
X = feat_df.values.astype(np.float32)
# NOTE(review): rows with up to 30% missing features are kept, so X may still
# contain NaN; no imputation is done in this cell — confirm the source data is
# complete or that downstream models tolerate NaN.

# 80/20 split (time-based)
split_idx = int(0.8 * len(df))
X_train = X[:split_idx]
X_val   = X[split_idx:]
print("RAW shapes:", X_train.shape, X_val.shape)

# Scaling (fit on train only)
scaler = RobustScaler(quantile_range=(25.0, 75.0))
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)

# Persist the fitted scaler and the feature order next to the model artifacts.
joblib.dump(scaler, ARTIFACTS / "scaler_robust.joblib")
(ARTIFACTS / "features.json").write_text(json.dumps(features, indent=2), encoding="utf-8")
print("Saved scaler & features.")


RAW shapes: (25439, 25) (6360, 25)
Saved scaler & features.


In [32]:
# T-3 — Dense AE (TorchScript + SD) + loss plot

# Seed the torch and NumPy global RNGs before any model is constructed.
torch.manual_seed(42); np.random.seed(42)
input_dim = len(features)  # autoencoder input/output width (one unit per feature)
hidden = 128               # default width `h` of DenseAE's widest hidden layers

class DenseAE(nn.Module):
    """Fully connected autoencoder: d_in -> h -> h//2 -> h -> d_in.

    ReLU follows every linear layer except the final output layer.
    The submodule names (`enc`, `dec`) and the position of each layer inside
    its Sequential define the state-dict keys, so they are kept unchanged.
    """

    def __init__(self, d_in=input_dim, h=hidden):
        super().__init__()
        bottleneck = h // 2
        encoder_layers = [
            nn.Linear(d_in, h),
            nn.ReLU(),
            nn.Linear(h, bottleneck),
            nn.ReLU(),
        ]
        self.enc = nn.Sequential(*encoder_layers)
        decoder_layers = [
            nn.Linear(bottleneck, h),
            nn.ReLU(),
            nn.Linear(h, d_in),
        ]
        self.dec = nn.Sequential(*decoder_layers)

    def forward(self, x):  # x: [B,F]
        """Encode then decode `x`, returning a reconstruction of the same shape."""
        return self.dec(self.enc(x))

def train_dense(Xtr, Xva, epochs=400, batch=2048, lr=1e-3, patience=40):
    """Train a DenseAE to reconstruct its input, early-stopping on validation MSE.

    Args:
        Xtr: 2-D NumPy array of training rows (one row per sample).
        Xva: 2-D NumPy array of validation rows.
        epochs: maximum number of epochs.
        batch: mini-batch size for both loaders.
        lr: Adam learning rate.
        patience: stop once this many consecutive epochs fail to improve the
            best validation loss by more than 1e-7.

    Returns:
        (model, hist_tr, hist_va): the model with the best-validation weights
        loaded, plus per-epoch sample-weighted mean train / validation MSE.

    Raises:
        RuntimeError: if no epoch ever improved the validation loss (for
            example the loss is NaN because the inputs contain NaN/inf, or
            `epochs` < 1), so there is no checkpoint to restore.
    """
    # `.float()` is a no-op for float32 input and keeps dtype handling
    # consistent with SeqDS for any other float dtype.
    dl_tr = DataLoader(torch.from_numpy(Xtr).float(), batch_size=batch, shuffle=True)
    dl_va = DataLoader(torch.from_numpy(Xva).float(), batch_size=batch, shuffle=False)
    model = DenseAE().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.MSELoss()
    best = np.inf
    bad = 0
    best_state = None
    hist_tr, hist_va = [], []
    for ep in range(1, epochs + 1):
        model.train()
        tr = 0.0
        for xb in dl_tr:
            xb = xb.to(DEVICE)
            opt.zero_grad()
            yhat = model(xb)
            loss = crit(yhat, xb)
            loss.backward()
            opt.step()
            # Weight by batch size so the epoch value is a per-sample mean.
            tr += loss.item() * xb.size(0)
        tr /= max(1, len(Xtr))  # guarded like the validation average below

        model.eval()
        va = 0.0
        with torch.no_grad():
            for xb in dl_va:
                xb = xb.to(DEVICE)
                va += crit(model(xb), xb).item() * xb.size(0)
        va /= max(1, len(Xva))
        hist_tr.append(tr)
        hist_va.append(va)

        if va < best - 1e-7:
            # Snapshot on CPU so the checkpoint does not hold GPU memory.
            best = va
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
        if bad >= patience:
            break
        if ep % 20 == 0 or ep == 1:
            print(f"[DenseAE] ep {ep} tr={tr:.6f} va={va:.6f}")

    if best_state is None:
        # A NaN validation loss never compares as "better", which used to end
        # in an opaque failure inside load_state_dict(None).
        raise RuntimeError(
            "DenseAE training produced no improving validation loss; "
            "check the inputs for NaN/inf values and that epochs >= 1."
        )
    model.load_state_dict(best_state)
    return model, hist_tr, hist_va

# Train on the scaled splits; train_dense returns the model with its
# best-validation weights restored, plus the per-epoch loss histories.
dense_model, tr_hist, va_hist = train_dense(X_train_s, X_val_s)

# Save state dict (fallback for existing loader)
DENSE_SD = ARTIFACTS / "model_dense_best_state_dict.pt"
torch.save(dense_model.state_dict(), DENSE_SD)

# Save TorchScript (preferred)
# A fresh CPU model is rebuilt from the file just written and put in eval mode before tracing.
dense_cpu = DenseAE().cpu(); dense_cpu.load_state_dict(torch.load(DENSE_SD, map_location="cpu")); dense_cpu.eval()
# Tracing example: the first 8 validation rows, or 8 training rows if validation has fewer than 8.
example = torch.from_numpy(X_val_s[:8] if len(X_val_s)>=8 else X_train_s[:8]).float()
ts_dense = torch.jit.trace(dense_cpu, example)
DENSE_TS = ARTIFACTS / "model_dense_best_torchscript.pt"
ts_dense.save(str(DENSE_TS))

# Loss plot
# Written to disk and closed immediately, so nothing is rendered inline.
plt.figure()
plt.plot(tr_hist, label="train"); plt.plot(va_hist, label="val")
plt.xlabel("epoch"); plt.ylabel("MSE"); plt.legend(); plt.title("DenseAE Loss")
plt.tight_layout()
plt.savefig(ARTIFACTS / "dense_loss.png", dpi=150); plt.close()

print("Saved:", DENSE_SD, DENSE_TS, ARTIFACTS / "dense_loss.png")


[DenseAE] ep 1 tr=33.201632 va=43.421375
[DenseAE] ep 20 tr=0.246179 va=0.393115
[DenseAE] ep 40 tr=0.085552 va=0.172796
[DenseAE] ep 60 tr=0.050019 va=0.136548
[DenseAE] ep 80 tr=0.036052 va=0.100315
[DenseAE] ep 100 tr=0.029672 va=0.085582
[DenseAE] ep 120 tr=0.024741 va=0.073865
[DenseAE] ep 140 tr=0.023720 va=0.086461
[DenseAE] ep 160 tr=0.016935 va=0.057797
[DenseAE] ep 180 tr=0.015978 va=0.052623
[DenseAE] ep 200 tr=0.014596 va=0.050150
[DenseAE] ep 220 tr=0.011402 va=0.043019
[DenseAE] ep 240 tr=0.011075 va=0.035472
[DenseAE] ep 260 tr=0.012298 va=0.034297
[DenseAE] ep 280 tr=0.009241 va=0.032673
[DenseAE] ep 300 tr=0.008677 va=0.030970
[DenseAE] ep 320 tr=0.007531 va=0.039391
[DenseAE] ep 340 tr=0.009405 va=0.040816
[DenseAE] ep 360 tr=0.006290 va=0.025310
[DenseAE] ep 380 tr=0.005496 va=0.022167
[DenseAE] ep 400 tr=0.008814 va=0.027946
Saved: C:\engine_module_pipeline\infer_stage\artifacts\model_dense_best_state_dict.pt C:\engine_module_pipeline\infer_stage\artifacts\model_den

In [34]:
# T-4 — LSTM AE (TorchScript + SD named exactly as your loader expects)

SEQ = 10  # causal windows, consistent with inference

def build_sequences(X, seq=SEQ):
    """Stack every length-`seq` run of consecutive rows of `X` into one array.

    Window k covers rows k .. k+seq-1, so each window ends at its own
    "current" row and only looks backwards. Returns a float32 array of shape
    (len(X) - seq + 1, seq, X.shape[1]), or an empty (0, seq, X.shape[1])
    array when `X` has fewer than `seq` rows.
    """
    windows = [X[end - seq:end] for end in range(seq, len(X) + 1)]
    if not windows:
        return np.empty((0, seq, X.shape[1]), dtype=np.float32)
    stacked = np.stack(windows, axis=0)
    return stacked.astype(np.float32)

# Window each scaled split independently, so no validation window contains training rows.
Xtr_seq = build_sequences(X_train_s, SEQ)
Xva_seq = build_sequences(X_val_s,   SEQ)
print("Seq shapes:", Xtr_seq.shape, Xva_seq.shape)

class LSTMAE(nn.Module):
    """Two stacked LSTMs plus a linear head that reconstructs the last timestep.

    The attribute names (`encoder`, `decoder`, `out`) define the state-dict
    keys and are kept unchanged.
    """

    def __init__(self, d_in=len(features), enc_h=64, dec_h=64, layers=1):
        super().__init__()
        shared_kwargs = dict(num_layers=layers, batch_first=True)
        self.encoder = nn.LSTM(input_size=d_in, hidden_size=enc_h, **shared_kwargs)
        self.decoder = nn.LSTM(input_size=enc_h, hidden_size=dec_h, **shared_kwargs)
        self.out = nn.Linear(dec_h, d_in)

    def forward(self, x):             # x: [B,T,F]
        """Return the reconstruction of the final timestep, shape [B,F]."""
        encoded = self.encoder(x)[0]      # per-step encoder outputs [B,T,H]
        decoded = self.decoder(encoded)[0]
        last_step = decoded[:, -1, :]
        return self.out(last_step)

class SeqDS(Dataset):
    """Dataset over a NumPy array, served as float32 tensors indexed on axis 0."""

    def __init__(self, X):
        as_tensor = torch.from_numpy(X)
        self.X = as_tensor.float()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i]

def train_lstm(Xtr, Xva, epochs=400, batch=512, lr=1e-3, patience=40):
    """Train an LSTMAE to reconstruct the last timestep of each window.

    Args:
        Xtr: 3-D NumPy array of training windows, indexed [window, step, feature].
        Xva: 3-D NumPy array of validation windows, same layout.
        epochs: maximum number of epochs.
        batch: mini-batch size for both loaders.
        lr: Adam learning rate.
        patience: stop once this many consecutive epochs fail to improve the
            best validation loss by more than 1e-7.

    Returns:
        (model, tr_hist, va_hist): the model with the best-validation weights
        loaded, plus per-epoch sample-weighted mean train / validation MSE.

    Raises:
        RuntimeError: if no epoch ever improved the validation loss (for
            example the loss is NaN because the inputs contain NaN/inf, or
            `epochs` < 1), so there is no checkpoint to restore.
    """
    dl_tr = DataLoader(SeqDS(Xtr), batch_size=batch, shuffle=True)
    dl_va = DataLoader(SeqDS(Xva), batch_size=batch, shuffle=False)
    model = LSTMAE().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.MSELoss()
    best = np.inf
    bad = 0
    best_state = None
    tr_hist, va_hist = [], []
    for ep in range(1, epochs + 1):
        model.train()
        tr = 0.0
        for xb in dl_tr:
            xb = xb.to(DEVICE)
            opt.zero_grad()
            yhat = model(xb)
            y = xb[:, -1, :]  # target is the final step of the input window
            loss = crit(yhat, y)
            loss.backward()
            opt.step()
            # Weight by batch size so the epoch value is a per-sample mean.
            tr += loss.item() * xb.size(0)
        tr /= max(1, len(Xtr))  # guarded like the validation average below

        model.eval()
        va = 0.0
        with torch.no_grad():
            for xb in dl_va:
                xb = xb.to(DEVICE)
                yhat = model(xb)
                y = xb[:, -1, :]
                va += crit(yhat, y).item() * xb.size(0)
        va /= max(1, len(Xva))
        tr_hist.append(tr)
        va_hist.append(va)

        if va < best - 1e-7:
            # Snapshot on CPU so the checkpoint does not hold GPU memory.
            best = va
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
        if bad >= patience:
            break
        if ep % 20 == 0 or ep == 1:
            print(f"[LSTMAE] ep {ep} tr={tr:.6f} va={va:.6f}")

    if best_state is None:
        # A NaN validation loss never compares as "better", which used to end
        # in an opaque failure inside load_state_dict(None).
        raise RuntimeError(
            "LSTMAE training produced no improving validation loss; "
            "check the inputs for NaN/inf values and that epochs >= 1."
        )
    model.load_state_dict(best_state)
    return model, tr_hist, va_hist

# Train on the windowed splits; train_lstm returns the model with its
# best-validation weights restored, plus the per-epoch loss histories.
lstm_model, tr_l, va_l = train_lstm(Xtr_seq, Xva_seq)

# Save state dict with the EXACT filename your inference loader expects
LSTM_SD = ARTIFACTS / "model_lstm_long_best.pt"  # <- compatibility with your current loader
torch.save(lstm_model.state_dict(), LSTM_SD)

# TorchScript export (preferred; you can point ARTS['lstm_ts'] to this)
# A fresh CPU model is rebuilt from the file just written and put in eval mode before tracing.
lstm_cpu = LSTMAE().cpu(); lstm_cpu.load_state_dict(torch.load(LSTM_SD, map_location="cpu")); lstm_cpu.eval()
# Tracing example: the first 8 validation windows, or 8 training windows if validation has fewer than 8.
example = torch.from_numpy(Xva_seq[:8] if len(Xva_seq)>=8 else Xtr_seq[:8]).float()
ts_lstm = torch.jit.trace(lstm_cpu, example)
LSTM_TS = ARTIFACTS / "model_lstm_long_best_torchscript.pt"
ts_lstm.save(str(LSTM_TS))

# Loss plot
# Written to disk and closed immediately, so nothing is rendered inline.
plt.figure()
plt.plot(tr_l, label="train"); plt.plot(va_l, label="val")
plt.xlabel("epoch"); plt.ylabel("MSE"); plt.legend(); plt.title("LSTMAE Loss (last-step)")
plt.tight_layout()
plt.savefig(ARTIFACTS / "lstm_loss.png", dpi=150); plt.close()

print("Saved:", LSTM_SD, LSTM_TS, ARTIFACTS / "lstm_loss.png")


Seq shapes: (25430, 10, 25) (6351, 10, 25)
[LSTMAE] ep 1 tr=34.248627 va=49.941059
[LSTMAE] ep 20 tr=8.664499 va=12.270056
[LSTMAE] ep 40 tr=3.464946 va=4.210563
[LSTMAE] ep 60 tr=1.821499 va=2.086655
[LSTMAE] ep 80 tr=1.149877 va=1.407794
[LSTMAE] ep 100 tr=0.786528 va=1.210577
[LSTMAE] ep 120 tr=0.530290 va=0.994820
[LSTMAE] ep 140 tr=0.530129 va=1.003904
[LSTMAE] ep 160 tr=0.311875 va=0.856669
[LSTMAE] ep 180 tr=0.235072 va=0.785827
[LSTMAE] ep 200 tr=0.183451 va=0.785236
[LSTMAE] ep 220 tr=0.154830 va=0.681515
[LSTMAE] ep 240 tr=0.136194 va=0.659810
[LSTMAE] ep 260 tr=0.105469 va=0.582793
[LSTMAE] ep 280 tr=0.093608 va=0.545961
[LSTMAE] ep 300 tr=0.081772 va=0.553259
[LSTMAE] ep 320 tr=0.068287 va=0.546453
[LSTMAE] ep 340 tr=0.061899 va=0.493811
[LSTMAE] ep 360 tr=0.059055 va=0.462947
[LSTMAE] ep 380 tr=0.050465 va=0.506958
Saved: C:\engine_module_pipeline\infer_stage\artifacts\model_lstm_long_best.pt C:\engine_module_pipeline\infer_stage\artifacts\model_lstm_long_best_torchscript.

In [35]:
# T-5 — Isolation Forest on scaled rows (global, not per-feature)

# One forest over all scaled feature columns at once, fitted on the training split only.
iso = IsolationForest(
    n_estimators=400,
    max_samples="auto",
    contamination="auto",
    random_state=42,      # fixed seed for repeatable trees
    n_jobs=-1,            # use all available cores
)
iso.fit(X_train_s)
ISOF_PATH = ARTIFACTS / "isolation_forest_combiner_final.joblib"
joblib.dump(iso, ISOF_PATH)
print("Saved:", ISOF_PATH)


Saved: C:\engine_module_pipeline\infer_stage\artifacts\isolation_forest_combiner_final.joblib


In [36]:
# T-6 — KDE & GMM per-feature with Windows-safe filenames + index JSONs

KDE_BW     = 0.2   # Gaussian KDE bandwidth (applied to the scaled feature values)
GMM_MAX_K  = 5     # largest number of mixture components tried per feature

# canonical feature name -> artifact filename, filled in by the loop below
kde_index = {}
gmm_index = {}

for j, feat in enumerate(features):
    # Column j of the scaled training matrix as an (n, 1) array: one univariate model per feature.
    v = X_train_s[:, j].reshape(-1, 1)

    # KDE
    kde = KernelDensity(kernel="gaussian", bandwidth=KDE_BW)
    kde.fit(v)
    kde_fname = f"kde_{sanitize_for_filename(feat)}.joblib"
    joblib.dump(kde, ARTIFACTS / kde_fname)
    kde_index[feat] = kde_fname

    # GMM (best by BIC)
    # Try 1..GMM_MAX_K components and keep the first model with the strictly lowest BIC.
    best_bic = np.inf; best_gmm = None
    for k in range(1, GMM_MAX_K+1):
        gmm = GaussianMixture(n_components=k, covariance_type="full", random_state=42)
        gmm.fit(v)
        bic = gmm.bic(v)
        if bic < best_bic:
            best_bic, best_gmm = bic, gmm
    gmm_fname = f"gmm_{sanitize_for_filename(feat)}.joblib"
    joblib.dump(best_gmm, ARTIFACTS / gmm_fname)
    gmm_index[feat] = gmm_fname

# Index JSONs (canonical feature -> actual filename)
(ARTIFACTS / "kde_index.json").write_text(json.dumps(kde_index, indent=2), encoding="utf-8")
(ARTIFACTS / "gmm_index.json").write_text(json.dumps(gmm_index, indent=2), encoding="utf-8")
print("Saved KDE/GMM indices (25/25).")


Saved KDE/GMM indices (25/25).


In [37]:
# T-7 — Baselines (median/MAD + mean/std) and runtime manifest

import numpy as np, json, joblib, pandas as pd, torch
from pathlib import Path

def robust_stats(a):
    """Summarise the finite values of `a` as median / MAD / mean / std.

    The input is flattened and non-finite entries (NaN, +/-inf) are discarded.
    `mad` and `std` each have 1e-9 added so they are never exactly zero.
    With no finite values left, a neutral summary (centre 0, spread 1) is returned.
    """
    flat = np.asarray(a).ravel()
    finite = flat[np.isfinite(flat)]
    if finite.size == 0:
        return {"median": 0.0, "mad": 1.0, "mean": 0.0, "std": 1.0}
    center = float(np.median(finite))
    abs_dev = np.abs(finite - center)
    return {
        "median": center,
        "mad": float(np.median(abs_dev) + 1e-9),
        "mean": float(np.mean(finite)),
        "std": float(np.std(finite) + 1e-9),
    }

# Dense recon errors on val (TorchScript)
# Reload the exported TorchScript file rather than reusing the in-memory model,
# so the baseline is computed with the artifact that was written to disk.
dense_cpu = torch.jit.load(str(ARTIFACTS / "model_dense_best_torchscript.pt"), map_location="cpu").eval()
with torch.no_grad():
    xt = torch.from_numpy(X_val_s).float()
    recon = dense_cpu(xt).numpy()
# Per-row mean squared reconstruction error across the feature columns.
dense_err = ((X_val_s - recon)**2).mean(axis=1)

# LSTM last-step recon errors on val (TorchScript)
lstm_cpu = torch.jit.load(str(ARTIFACTS / "model_lstm_long_best_torchscript.pt"), map_location="cpu").eval()

def make_seq(X, seq=10):
    """Stack every length-`seq` run of consecutive rows of `X` into one float32 array.

    Returns shape (len(X) - seq + 1, seq, X.shape[1]); when `X` has fewer than
    `seq` rows the result is an empty (0, seq, X.shape[1]) array.
    """
    if len(X) < seq:
        return np.empty((0, seq, X.shape[1]), dtype=np.float32)
    windows = [X[end - seq:end] for end in range(seq, len(X) + 1)]
    stacked = np.stack(windows, axis=0)
    return stacked.astype(np.float32)

# Build validation windows with the same length the LSTM was trained on.
# This was a hard-coded 10; using the notebook-level SEQ keeps the baseline
# consistent with training if the window length is ever changed.
Xva_seq = make_seq(X_val_s, SEQ)
if len(Xva_seq) > 0:
    with torch.no_grad():
        yhat = lstm_cpu(torch.from_numpy(Xva_seq).float()).numpy()
    # The model reconstructs only the last step of each window, so compare against that step.
    y = Xva_seq[:,-1,:]
    lstm_err = ((y - yhat)**2).mean(axis=1)
else:
    # Too few validation rows to form a single window.
    lstm_err = np.array([])

# Isolation Forest decision function
# Scored with the model reloaded from the saved artifact.
iso_scores = joblib.load(ARTIFACTS / "isolation_forest_combiner_final.joblib").decision_function(X_val_s)

# KDE/GMM aggregate log-prob — NOTE: GMM uses score_samples now (not score)
def aggregate_logp(index_name, kind):
    """Sum the per-feature log-densities for every row of `X_val_s`.

    Args:
        index_name: filename (inside ARTIFACTS) of the JSON index mapping
            canonical feature name -> model artifact filename.
        kind: "kde" or "gmm". Both model types are scored with
            `score_samples` (per-sample log-likelihood), so this is only
            used to label diagnostic messages.

    Returns:
        1-D float array with one entry per validation row: the sum of the
        log-densities over the features that could be scored for that row,
        or NaN when none could.

    Each per-feature model is loaded from disk once and scores its whole
    column in a single call. The previous implementation re-loaded the
    model for every (row, feature) pair, i.e. rows x features disk loads.
    """
    idx = json.loads((ARTIFACTS / index_name).read_text(encoding="utf-8"))
    n_rows = len(X_val_s)
    totals = np.zeros(n_rows, dtype=np.float64)
    counts = np.zeros(n_rows, dtype=np.int64)
    for j, feat in enumerate(features):
        fname = idx.get(feat)
        if not fname:
            continue
        model = joblib.load(ARTIFACTS / fname)
        col = np.asarray(X_val_s[:, j], dtype=np.float64)
        # Non-finite values cannot be scored; they are skipped per row, as the
        # old per-value try/except effectively did.
        ok = np.isfinite(col)
        if not ok.any():
            continue
        try:
            scores = model.score_samples(col[ok].reshape(-1, 1))
        except Exception as exc:
            # Best-effort aggregate: keep going, but report the problem
            # instead of swallowing it silently.
            print(f"[aggregate_logp] {kind}: skipping feature {feat!r}: {exc}")
            continue
        totals[ok] += scores
        counts[ok] += 1
    return np.where(counts > 0, totals, np.nan)

# Aggregate log-density per validation row for each per-feature model family.
kde_lp = aggregate_logp("kde_index.json", "kde")
gmm_lp = aggregate_logp("gmm_index.json", "gmm")

# Baselines dictionary (used by inference robust normalization)
# One median/MAD/mean/std summary per detector, all computed on the validation split.
BASELINES = {
    "dense": robust_stats(dense_err),
    "lstm":  robust_stats(lstm_err),
    "isof":  robust_stats(iso_scores),
    "kde":   robust_stats(kde_lp),
    "gmm":   robust_stats(gmm_lp),
}
(ARTIFACTS / "scoring_summary.json").write_text(json.dumps(BASELINES, indent=2), encoding="utf-8")

# Runtime manifest: filenames (relative to ARTIFACTS) of everything this notebook wrote,
# together with the feature order and the raw->canonical column mapping.
manifest = {
    "created_utc": pd.Timestamp.utcnow().isoformat(),
    "features_canonical_order": features,
    "raw_to_canonical_map": RAW_TO_CANON,
    "scaler_file": "scaler_robust.joblib",
    "dense_torchscript": "model_dense_best_torchscript.pt",
    "dense_state_dict": "model_dense_best_state_dict.pt",
    "lstm_torchscript": "model_lstm_long_best_torchscript.pt",
    "lstm_state_dict_compat": "model_lstm_long_best.pt",  # compatibility filename
    "isolation_forest_joblib": "isolation_forest_combiner_final.joblib",
    "kde_index_file": "kde_index.json",
    "gmm_index_file": "gmm_index.json",
    "filename_sanitizer": {"colon": "__COLON__"},  # unchanged
    "model_input_contract_file": "model_input_contract.json"
}
(ARTIFACTS / "runtime_manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("Saved baselines and manifest.")


Saved baselines and manifest.


In [38]:
# T-8 — quick sanity
# List what was written, then print one headline number per detector
# from the validation-split scores computed in the baselines cell.
print("Artifacts created in:", ARTIFACTS)
print("Files:", sorted([p.name for p in ARTIFACTS.iterdir()]))

# Mean per-row reconstruction error; None when the error array is empty.
print("\nDense val MSE ~", float(np.mean(dense_err)) if len(dense_err) else None)
print("LSTM  val MSE ~", float(np.mean(lstm_err)) if len(lstm_err) else None)
print("IF    decision_function stats:", {
    "min": float(np.nanmin(iso_scores)), "max": float(np.nanmax(iso_scores)),
    "mean": float(np.nanmean(iso_scores))
})
# NaN-aware mean of the aggregate log-densities; None when no entry is finite.
print("KDE   agg logp ~", float(np.nanmean(kde_lp)) if np.isfinite(kde_lp).any() else None)
print("GMM   agg logp ~", float(np.nanmean(gmm_lp)) if np.isfinite(gmm_lp).any() else None)


Artifacts created in: C:\engine_module_pipeline\infer_stage\artifacts
Files: ['dense_loss.png', 'features.json', 'gmm_Air_Fuel_Ratio_Commanded___COLON__1.joblib', 'gmm_Air_Fuel_Ratio_Measured___COLON__1.joblib', 'gmm_Catalyst_Temperature__Bank_1_Sensor_1.joblib', 'gmm_Catalyst_Temperature__Bank_1_Sensor_2.joblib', 'gmm_ECU_7EA__COLON___Engine_Coolant_Temperature.joblib', 'gmm_ECU_7EA__COLON___Intake_Air_Temperature.joblib', 'gmm_ECU_7EB__COLON___Ambient_air_temp.joblib', 'gmm_ECU_7EB__COLON___Engine_Load_pct.joblib', 'gmm_ECU_7EB__COLON___Engine_RPM_rpm.joblib', 'gmm_ECU_7EB__COLON___Speed__OBD_km_h.joblib', 'gmm_Engine_Load_Absolute_pct.joblib', 'gmm_Engine_Oil_Temperature.joblib', 'gmm_Engine_RPM_rpm.joblib', 'gmm_Engine_kW__At_the_wheels_kW.joblib', 'gmm_Fuel_Trim_Bank_1_Long_Term_pct.joblib', 'gmm_Fuel_Trim_Bank_1_Short_Term_pct.joblib', 'gmm_Fuel_flow_rate_hour_l_hr.joblib', 'gmm_Mass_Air_Flow_Rate_g_s.joblib', 'gmm_O2_Bank_1_Sensor_2_Voltage_V.joblib', 'gmm_O2_Sensor1_Wide_Range_