In [4]:
import os, json, random
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility: one seed shared by the Python, NumPy and PyTorch RNGs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", DEVICE)


# Input files. NOTE(review): both names currently point at the same CSV —
# confirm whether a separate "big" file is intended.
SMALL_CSV = "Book1.csv"
BIG_CSV = "Book1.csv"

# Output directories, created up front so later saves cannot fail on a
# missing folder.
ARTIFACTS_DIR = Path("artifacts")
MODELS_DIR = Path("Models")
for _out_root in (ARTIFACTS_DIR, MODELS_DIR):
    _out_root.mkdir(parents=True, exist_ok=True)

# Column layout: signal_1 .. signal_60 plus a timestamp column.
FEATURES = ["signal_" + str(k) for k in range(1, 61)]
TIMESTAMP_COL = "Timestamp"

# Sliding-window settings used for the sequence (LSTM) datasets.
SEQ_LEN = 30
STRIDE = 1


def _param_set(art_id, *, lr, epochs, patience, dense_batch, lstm_batch,
               weight_decay, hidden, latent, layers):
    """Build one hyper-parameter variant.

    Within a variant the dense and LSTM models share the learning rate, epoch
    budget and early-stopping patience; only batch size and the
    model-specific settings differ. Key order is fixed because the result is
    dumped to JSON as-is.
    """
    return {
        "id": art_id,
        "dense": {
            "lr": lr,
            "batch": dense_batch,
            "epochs": epochs,
            "patience": patience,
            "weight_decay": weight_decay,
        },
        "lstm": {
            "lr": lr,
            "batch": lstm_batch,
            "epochs": epochs,
            "patience": patience,
            "hidden": hidden,
            "latent": latent,
            "layers": layers,
        },
    }


# Hyper-parameter variants; each one produces its own model folder.
PARAM_SETS = [
    _param_set("artifact_1", lr=1e-3, epochs=40, patience=5,
               dense_batch=512, lstm_batch=128, weight_decay=0.0,
               hidden=64, latent=16, layers=1),
    _param_set("artifact_2", lr=5e-4, epochs=50, patience=6,
               dense_batch=512, lstm_batch=128, weight_decay=1e-5,
               hidden=96, latent=24, layers=1),
    _param_set("artifact_3", lr=1e-3, epochs=35, patience=5,
               dense_batch=1024, lstm_batch=256, weight_decay=1e-6,
               hidden=80, latent=20, layers=1),
    _param_set("artifact_4", lr=2e-3, epochs=30, patience=4,
               dense_batch=512, lstm_batch=128, weight_decay=0.0,
               hidden=64, latent=16, layers=2),
    _param_set("artifact_5", lr=7e-4, epochs=45, patience=5,
               dense_batch=512, lstm_batch=128, weight_decay=5e-6,
               hidden=72, latent=18, layers=1),
]



def load_signals_csv(path, features=FEATURES, timestamp_col=TIMESTAMP_COL):
    """Load the requested signal columns from a CSV as a float32 matrix.

    The header is read first so that only the needed columns are parsed.
    Requested features that are absent from the file are zero-filled (a
    warning names them), so the result always has one column per entry of
    ``features``, in that order.

    Args:
        path: CSV file path (anything ``pandas.read_csv`` accepts as a path).
        features: Ordered column names that form the output columns.
        timestamp_col: Name of the timestamp column. It is never part of the
            output; it is parsed only when none of ``features`` exist in the
            file, so that the file's row count is still reflected in the
            result.

    Returns:
        ``numpy.ndarray`` of dtype float32 with shape
        ``(n_rows, len(features))``.
    """
    import warnings

    features = list(features)
    header = set(pd.read_csv(path, nrows=0).columns)
    present = [name for name in features if name in header]
    missing = [name for name in features if name not in header]
    if missing:
        # Zero-filling is kept as a best-effort fallback, but it should not
        # happen silently: an all-zero channel skews scaling and training.
        warnings.warn(
            f"{path}: {len(missing)} feature column(s) missing and "
            f"zero-filled: {missing}",
            stacklevel=2,
        )

    # The timestamp is not needed for the matrix, so skip parsing it unless it
    # is the only known column available to carry the row count.
    usecols = present or [c for c in (timestamp_col,) if c in header]
    df = pd.read_csv(path, usecols=usecols)

    # reindex selects the features in order and zero-fills absent ones in one
    # step (avoids inserting columns one at a time).
    return df.reindex(columns=features, fill_value=0.0).astype("float32").values

class TabularDataset(Dataset):
    """Map-style dataset that serves one float32 row tensor per index."""

    def __init__(self, X):
        # Convert once up front so every item lookup is a cheap slice.
        self.X = np.asarray(X, dtype=np.float32)

    def __len__(self):
        n_rows = len(self.X)
        return n_rows

    def __getitem__(self, idx):
        row = self.X[idx]
        return torch.from_numpy(row)

class SequenceDataset(Dataset):
    """Sliding-window dataset over the rows of a 2-D array.

    Window ``i`` covers rows ``[i*stride, i*stride + seq_len)`` of the input,
    so rows must already be in the order the windows should follow.

    The windows are exposed through ``X_seq`` as a zero-copy, read-only view
    of the input instead of a stacked copy, which keeps memory at the size of
    the input rather than ``seq_len`` times that size. Each item is copied
    when it is fetched, so returned tensors are independent and writable.

    Args:
        X_scaled: Array-like whose first axis is the row (time) axis.
        seq_len: Number of consecutive rows per window (>= 1).
        stride: Step between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``seq_len`` or ``stride`` is smaller than 1.
    """

    def __init__(self, X_scaled, seq_len=30, stride=1):
        from numpy.lib.stride_tricks import sliding_window_view

        if seq_len < 1 or stride < 1:
            raise ValueError(
                f"seq_len and stride must be >= 1 (got seq_len={seq_len}, stride={stride})"
            )
        self.seq_len = seq_len
        self.stride = stride
        X = np.asarray(X_scaled, dtype=np.float32)
        if len(X) < seq_len:
            # Too short for a single window: empty dataset whose item shape
            # still matches (works for 1-D input as well).
            self.X_seq = np.empty((0, seq_len) + X.shape[1:], dtype=np.float32)
        else:
            # sliding_window_view appends the window axis last; move it next
            # to the window index so items are laid out time-major, then keep
            # every `stride`-th window.
            windows = sliding_window_view(X, seq_len, axis=0)
            self.X_seq = np.moveaxis(windows, -1, 1)[::stride]

    def __len__(self):
        return len(self.X_seq)

    def __getitem__(self, idx):
        # Copy: the view is read-only and shares memory with the source array.
        return torch.from_numpy(np.array(self.X_seq[idx]))

class DenseAE(nn.Module):
    """Fully connected autoencoder: in_dim -> 40 -> 20 -> 10 -> 20 -> 40 -> in_dim.

    Every linear layer is followed by a ReLU except the final output layer.
    """

    def __init__(self, in_dim=60):
        super().__init__()
        widths = (in_dim, 40, 20, 10)
        # Encoder narrows down to the bottleneck; the decoder mirrors it.
        self.encoder = self._stack(widths, activate_last=True)
        self.decoder = self._stack(widths[::-1], activate_last=False)

    @staticmethod
    def _stack(widths, activate_last):
        """Chain Linear layers over consecutive widths with ReLU in between."""
        modules = []
        final_pos = len(widths) - 2
        for pos, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
            modules.append(nn.Linear(n_in, n_out))
            if activate_last or pos < final_pos:
                modules.append(nn.ReLU())
        return nn.Sequential(*modules)

    def forward(self, x):
        return self.decoder(self.encoder(x))

class LSTMAE(nn.Module):
    """LSTM sequence autoencoder with a linear latent bottleneck.

    The encoder LSTM reads the whole window; its last output step is projected
    to ``latent_dim``. That latent vector is projected back to ``hidden_dim``
    and used as the initial hidden state of every decoder layer.

    The decoder is teacher-forced with the input shifted one step to the
    right (a zero frame first, then ``x[:, :-1]``). Feeding ``x`` unshifted
    would hand the decoder the very frame it must output at each step, letting
    it copy the input and bypass the bottleneck, which makes reconstruction
    error uninformative for anomaly scoring.

    NOTE(review): parameter names and shapes are unchanged, but weights
    trained with the unshifted decoder input are not interchangeable with this
    forward pass. Any separate inference-side copy of this class must use the
    same forward.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden size of the encoder and decoder LSTMs.
        latent_dim: Size of the bottleneck vector.
        num_layers: Number of stacked LSTM layers in the encoder and decoder.
    """

    def __init__(self, input_dim=60, hidden_dim=64, latent_dim=16, num_layers=1):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers

        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.to_latent = nn.Linear(hidden_dim, latent_dim)

        self.from_latent = nn.Linear(latent_dim, hidden_dim)
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.out = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Reconstruct ``x`` of shape ``(batch, time, input_dim)``; same shape out."""
        enc_out, _ = self.encoder(x)
        # Summarise the window by the top layer's output at the last step.
        z = self.to_latent(enc_out[:, -1, :])

        # Seed every decoder layer's hidden state from the latent vector; the
        # cell state starts at zero.
        h0 = self.from_latent(z).unsqueeze(0).repeat(self.num_layers, 1, 1)
        c0 = torch.zeros_like(h0)

        # Step t of the decoder sees frame t-1 (zeros at t=0), never frame t.
        start_frame = torch.zeros_like(x[:, :1, :])
        dec_in = torch.cat([start_frame, x[:, :-1, :]], dim=1)

        dec_out, _ = self.decoder(dec_in, (h0, c0))
        return self.out(dec_out)


def make_loader(ds, batch_size=256, shuffle_flag=True):
    """Wrap a dataset in a single-process ``DataLoader`` that keeps every sample.

    Args:
        ds: Map-style dataset (needs ``__len__`` and ``__getitem__``).
        batch_size: Samples per batch; the last batch may be smaller.
        shuffle_flag: Whether to reshuffle the samples every epoch.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``ds``.
    """
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine just triggers a warning, so enable it only with CUDA.
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle_flag,
        drop_last=False,
        num_workers=0,
        pin_memory=torch.cuda.is_available(),
    )

def _run_epoch(model, crit, loader, opt=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``opt`` is given every batch also takes an optimisation step;
    otherwise the pass only evaluates (the caller sets train/eval mode and
    gradient tracking).
    """
    total, seen = 0.0, 0
    for xb in loader:
        xb = xb.to(DEVICE)
        if opt is not None:
            opt.zero_grad(set_to_none=True)
        loss = crit(model(xb), xb)
        if opt is not None:
            loss.backward()
            opt.step()
        # Weight by batch size so a smaller last batch does not skew the mean.
        total += loss.item() * xb.size(0)
        seen += xb.size(0)
    return total / max(1, seen)


def _fit_autoencoder(model, opt, tr_loader, va_loader, epochs, patience):
    """Train with MSE reconstruction loss and early stopping on validation loss.

    Returns ``(model, best_val, hist)`` where the model carries the weights of
    the best validation epoch and ``hist`` holds per-epoch train/val losses.
    """
    crit = nn.MSELoss()
    best_val = float("inf")
    best_state = None
    stale_epochs = 0
    hist = {"train": [], "val": []}

    for _ in range(epochs):
        model.train()
        tr_loss = _run_epoch(model, crit, tr_loader, opt)
        model.eval()
        with torch.no_grad():
            va_loss = _run_epoch(model, crit, va_loader)
        hist["train"].append(float(tr_loss))
        hist["val"].append(float(va_loss))

        # Require a small absolute improvement so float noise does not reset
        # the patience counter.
        if va_loss < best_val - 1e-8:
            best_val = va_loss
            # Snapshot on CPU so the best weights survive further updates.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, float(best_val), hist


def _require_samples(tr_ds, va_ds, what):
    """Fail fast when either split has no samples.

    With an empty validation set the averaged loss would be 0.0, which would
    be recorded as a perfect score and stop training on the first epoch's
    weights.
    """
    if len(tr_ds) == 0 or len(va_ds) == 0:
        raise ValueError(
            f"{what}: need at least one training and one validation sample "
            f"(got train={len(tr_ds)}, val={len(va_ds)})"
        )


def train_one_dense(params: Dict[str, Any], Xtr, Xval):
    """Train a ``DenseAE`` on row vectors with early stopping.

    Args:
        params: Needs ``lr`` and ``batch``; optional ``weight_decay``
            (default 0.0), ``patience`` (default 5), ``epochs`` (default 50).
        Xtr: 2-D training matrix ``(rows, features)``, already scaled.
        Xval: 2-D validation matrix with the same number of columns.

    Returns:
        ``(model, best_val, hist)``: the model restored to its best validation
        epoch, that epoch's validation MSE, and the per-epoch loss history.

    Raises:
        ValueError: If the training or validation set is empty.
    """
    tr_ds = TabularDataset(Xtr)
    va_ds = TabularDataset(Xval)
    _require_samples(tr_ds, va_ds, "train_one_dense")

    # Size the network from the data rather than assuming 60 columns.
    model = DenseAE(in_dim=int(np.shape(Xtr)[1])).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=params["lr"],
                            weight_decay=params.get("weight_decay", 0.0))
    tr_loader = make_loader(tr_ds, batch_size=params["batch"], shuffle_flag=True)
    va_loader = make_loader(va_ds, batch_size=params["batch"], shuffle_flag=False)
    return _fit_autoencoder(model, opt, tr_loader, va_loader,
                            epochs=int(params.get("epochs", 50)),
                            patience=int(params.get("patience", 5)))


def train_one_lstm(params: Dict[str, Any], Xtr, Xval, seq_len=30, stride=1):
    """Train an ``LSTMAE`` on sliding windows with early stopping.

    Args:
        params: Needs ``lr`` and ``batch``; optional ``hidden`` (64),
            ``latent`` (16), ``layers`` (1), ``weight_decay``, ``patience``
            (5), ``epochs`` (50).
        Xtr: 2-D training matrix ``(rows, features)``; windows are cut from
            consecutive rows, so the rows must be in time order.
        Xval: 2-D validation matrix with the same number of columns.
        seq_len: Window length in rows.
        stride: Step between window starts.

    Returns:
        ``(model, best_val, hist)``, as for ``train_one_dense``.

    Raises:
        ValueError: If either split yields no windows (fewer than ``seq_len``
            rows).
    """
    tr_ds = SequenceDataset(Xtr, seq_len=seq_len, stride=stride)
    va_ds = SequenceDataset(Xval, seq_len=seq_len, stride=stride)
    _require_samples(tr_ds, va_ds, f"train_one_lstm(seq_len={seq_len})")

    model = LSTMAE(input_dim=int(np.shape(Xtr)[1]),
                   hidden_dim=int(params.get("hidden", 64)),
                   latent_dim=int(params.get("latent", 16)),
                   num_layers=int(params.get("layers", 1))).to(DEVICE)
    # 0.01 is AdamW's own default, which this optimiser was implicitly using;
    # spelling it out keeps behaviour identical while making it configurable.
    opt = torch.optim.AdamW(model.parameters(), lr=params["lr"],
                            weight_decay=params.get("weight_decay", 0.01))
    tr_loader = make_loader(tr_ds, batch_size=params["batch"], shuffle_flag=True)
    va_loader = make_loader(va_ds, batch_size=params["batch"], shuffle_flag=False)
    return _fit_autoencoder(model, opt, tr_loader, va_loader,
                            epochs=int(params.get("epochs", 50)),
                            patience=int(params.get("patience", 5)))

def dense_val_errors(model: nn.Module, Xval):
    """Return per-row reconstruction MSE (used to derive thresholds).

    Puts ``model`` in eval mode and runs it without gradient tracking.

    Args:
        model: Autoencoder mapping a ``(batch, features)`` tensor to the same
            shape.
        Xval: 2-D matrix of rows to score.

    Returns:
        1-D ``numpy.ndarray`` with one MSE per input row, in input order;
        an empty float32 array when ``Xval`` has no rows.
    """
    model.eval()
    loader = make_loader(TabularDataset(Xval), batch_size=1024, shuffle_flag=False)
    per_batch = []
    with torch.no_grad():
        for xb in loader:
            xb = xb.to(DEVICE)
            sq_err = (model(xb) - xb) ** 2
            # Average over the feature axis -> one error per row.
            per_batch.append(sq_err.mean(dim=1).detach().cpu().numpy())
    if not per_batch:
        return np.zeros((0,), dtype=np.float32)
    return np.concatenate(per_batch, axis=0)

def lstm_val_errors(model: nn.Module, Xval, seq_len=30, stride=1):
    """Return per-window reconstruction MSE, averaged over time and features.

    Puts ``model`` in eval mode and runs it without gradient tracking.

    Args:
        model: Sequence autoencoder mapping ``(batch, time, features)`` to the
            same shape.
        Xval: 2-D matrix from which sliding windows are cut.
        seq_len: Window length in rows.
        stride: Step between window starts.

    Returns:
        1-D ``numpy.ndarray`` with one MSE per window, in window order; an
        empty float32 array when no window fits.
    """
    model.eval()
    windows = SequenceDataset(Xval, seq_len=seq_len, stride=stride)
    loader = make_loader(windows, batch_size=256, shuffle_flag=False)
    per_batch = []
    with torch.no_grad():
        for xb in loader:
            xb = xb.to(DEVICE)
            sq_err = (model(xb) - xb) ** 2
            # Collapse the time and feature axes -> one error per window.
            per_batch.append(sq_err.mean(dim=(1, 2)).detach().cpu().numpy())
    if not per_batch:
        return np.zeros((0,), dtype=np.float32)
    return np.concatenate(per_batch, axis=0)

def percentiles(arr, ps=(90,95,99)):
    """Return the requested percentiles of ``arr`` keyed as ``"p<percent>"``.

    Args:
        arr: Array-like of numbers (lists and tuples are accepted as well as
            NumPy arrays).
        ps: Percentiles to compute, each in [0, 100].

    Returns:
        Dict mapping ``"p90"``-style keys to floats, or to ``None`` for every
        key when ``arr`` is empty.
    """
    values = np.asarray(arr)
    if values.size == 0:
        return {f"p{p}": None for p in ps}
    return {f"p{p}": float(np.percentile(values, p)) for p in ps}


# --- Small dataset: only its fitted scaler is persisted here. ---
X_small = load_signals_csv(SMALL_CSV, FEATURES, TIMESTAMP_COL)
X_s_tr, X_s_va = train_test_split(X_small, test_size=0.2, random_state=SEED, shuffle=True)

scaler_small = MinMaxScaler()
Xs_tr = scaler_small.fit_transform(X_s_tr)
Xs_va = scaler_small.transform(X_s_va)

joblib.dump(scaler_small, ARTIFACTS_DIR / "scaler_small.joblib")


# --- Production dataset: feeds both the dense and the LSTM models. ---
X_big = load_signals_csv(BIG_CSV, FEATURES, TIMESTAMP_COL)
# Chronological split (first 80% train, last 20% validation). The LSTM windows
# are cut from consecutive rows, so shuffling rows before the split would turn
# every window into an arbitrary mix of time steps.
# NOTE(review): assumes the CSV rows are already in time order — confirm.
Xb_tr, Xb_val = train_test_split(X_big, test_size=0.2, shuffle=False)

# Fit the scaler on the training part only, then apply it to validation.
scaler_prod = MinMaxScaler()
Xb_tr_s = scaler_prod.fit_transform(Xb_tr)
Xb_val_s = scaler_prod.transform(Xb_val)

joblib.dump(scaler_prod, ARTIFACTS_DIR / "scaler_prod.joblib")
with open(ARTIFACTS_DIR / "features.json", "w") as f:
    json.dump({"features": FEATURES, "timestamp_col": TIMESTAMP_COL}, f, indent=2)
with open(ARTIFACTS_DIR / "seq_config.json", "w") as f:
    json.dump({"seq_len": int(SEQ_LEN), "stride": int(STRIDE)}, f, indent=2)

print("Saved:", (ARTIFACTS_DIR / "scaler_prod.joblib").resolve())
print("Saved:", (ARTIFACTS_DIR / "features.json").resolve())
print("Saved:", (ARTIFACTS_DIR / "seq_config.json").resolve())


def _write_json(obj, path):
    """Serialise ``obj`` to ``path`` as indented JSON, closing the file promptly."""
    with open(path, "w") as fh:
        json.dump(obj, fh, indent=2)


training_report = {}
artifacts_index = []

_write_json(PARAM_SETS, ARTIFACTS_DIR / "param_sets.json")

# Train one dense and one LSTM autoencoder per hyper-parameter variant and
# write each variant's weights, manifest, thresholds and metrics to its folder.
for i, cfg in enumerate(PARAM_SETS, start=1):
    art_id = cfg.get("id", f"artifact_{i}")
    out_dir = MODELS_DIR / art_id
    out_dir.mkdir(parents=True, exist_ok=True)
    dense_path = out_dir / "dense_ae.pt"
    lstm_path = out_dir / "lstm_ae.pt"

    # Dense autoencoder on individual rows.
    dense_model, dense_val, dense_hist = train_one_dense(cfg["dense"], Xb_tr_s, Xb_val_s)
    torch.save(dense_model.state_dict(), dense_path)

    d_errs = dense_val_errors(dense_model, Xb_val_s)
    d_thresh = percentiles(d_errs, ps=(90, 95, 99))

    # LSTM autoencoder on sliding windows.
    lstm_model, lstm_val, lstm_hist = train_one_lstm(cfg["lstm"], Xb_tr_s, Xb_val_s,
                                                     seq_len=SEQ_LEN, stride=STRIDE)
    torch.save(lstm_model.state_dict(), lstm_path)

    l_errs = lstm_val_errors(lstm_model, Xb_val_s, seq_len=SEQ_LEN, stride=STRIDE)
    l_thresh = percentiles(l_errs, ps=(90, 95, 99))

    manifest = {
        "id": art_id,
        "dense_params": cfg["dense"],
        "lstm_params": cfg["lstm"],
        # Derived from the feature list instead of a hard-coded 60.
        "input_dim": len(FEATURES),
        "seq_len": int(SEQ_LEN),
        "stride": int(STRIDE),
        "val_losses": {"dense": float(dense_val), "lstm": float(lstm_val)},
        "files": {
            "dense_ae": str(dense_path.as_posix()),
            "lstm_ae": str(lstm_path.as_posix()),
        }
    }
    _write_json(manifest, out_dir / "manifest.json")

    thresholds = {
        "dense_row_mse": d_thresh,
        "lstm_win_mse":  l_thresh,
        "notes": "Thresholds are percentiles of validation reconstruction MSE. Use p95/p99 for anomaly flags; tune per stream."
    }
    _write_json(thresholds, out_dir / "thresholds.json")

    metrics = {
        "dense_loss_history": dense_hist,
        "lstm_loss_history": lstm_hist,
        # Only the first 1000 errors are stored, to keep the file small.
        "dense_val_errors_preview": [float(x) for x in d_errs[:1000]],
        "lstm_val_errors_preview":  [float(x) for x in l_errs[:1000]],
    }
    _write_json(metrics, out_dir / "metrics.json")

    training_report[art_id] = {
        "dense": {"best_val": float(dense_val)},
        "lstm":  {"best_val": float(lstm_val)},
        "thresholds": {"dense": d_thresh, "lstm": l_thresh},
    }
    artifacts_index.append({
        "id": art_id,
        "path": str(out_dir.as_posix()),
        "dense_val": float(dense_val),
        "lstm_val": float(lstm_val),
        "dense_pt": str(dense_path.as_posix()),
        "lstm_pt": str(lstm_path.as_posix()),
    })


_write_json(training_report, ARTIFACTS_DIR / "training_report.json")
_write_json(artifacts_index, ARTIFACTS_DIR / "artifacts_index.json")


print("\nArtifacts saved in 'artifacts/':")
for p in [
    ARTIFACTS_DIR/"scaler_small.joblib",
    ARTIFACTS_DIR/"scaler_prod.joblib",
    ARTIFACTS_DIR/"features.json",
    ARTIFACTS_DIR/"seq_config.json",
    ARTIFACTS_DIR/"param_sets.json",
    ARTIFACTS_DIR/"training_report.json",
    ARTIFACTS_DIR/"artifacts_index.json",
]:
    print(" -", p.resolve())

print("\nModel variants saved under 'Models/':")
for item in artifacts_index:
    print(f" - {item['id']}: {item['path']}")


Using device: cpu
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_prod.joblib
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\features.json
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\seq_config.json





Artifacts saved in 'artifacts/':
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_small.joblib
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_prod.joblib
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\features.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\seq_config.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\param_sets.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\training_report.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\artifacts_index.json

Model variants saved under 'Models/':
 - artifact_1: Models/artifact_1
 - artifact_2: Models/artifact_2
 - artifact_3: Models/artifact_3
 - artifact_4: Models/artifact_4
 - artifact_5: Models/artifact_5
