In [1]:
# %% [markdown]
# # PyTorch Autoencoder Training (Dense + LSTM) — Artifact Ready
# 
# This notebook trains tabular (Dense AE) and sequence (LSTM AE) autoencoders
# and saves all artifacts needed for streaming inference:
# - artifacts/scaler_small.joblib
# - artifacts/scaler_prod.joblib
# - artifacts/dense_ae.pt
# - artifacts/lstm_ae.pt
# - artifacts/best_params.json (includes seq_len, stride)
# - artifacts/features.json
# - artifacts/training_report.json
# 
# Edit SMALL_CSV / BIG_CSV paths as needed.

# %%
# 1) Imports & Global Config
import os, json, random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# Paths
SMALL_CSV = "Book1.csv"   # small subset for hyperparam search (~50k–100k rows)
BIG_CSV   = "Book1.csv"   # big dataset for final training (~600k)

ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

FEATURES = [f"signal_{i}" for i in range(1, 61)]
TIMESTAMP_COL = "Timestamp"  # optional in CSV

# Sequence defaults
SEQ_LEN = 30
STRIDE  = 1

# %%
# 2) Helpers: IO, datasets, models, training utilities

def load_signals_csv(path, features=FEATURES, timestamp_col=TIMESTAMP_COL):
    cols_in_file = pd.read_csv(path, nrows=0).columns.tolist()
    usecols = [c for c in features if c in cols_in_file] + ([timestamp_col] if timestamp_col in cols_in_file else [])
    df = pd.read_csv(path, usecols=usecols)
    # Ensure full feature set in correct order (fill missing with 0.0 if any)
    for f in features:
        if f not in df.columns:
            df[f] = 0.0
    X = df[features].astype("float32").values
    return X

class TabularDataset(Dataset):
    def __init__(self, X):
        self.X = np.asarray(X, dtype=np.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx])

class SequenceDataset(Dataset):
    def __init__(self, X_scaled, seq_len=30, stride=1):
        self.seq_len = seq_len; self.stride = stride
        X = np.asarray(X_scaled, dtype=np.float32)
        n = len(X)
        if n < seq_len:
            self.X_seq = np.empty((0, seq_len, X.shape[1]), dtype=np.float32)
        else:
            num = (n - seq_len) // stride + 1
            self.X_seq = np.stack([X[i*stride:i*stride+seq_len] for i in range(num)], axis=0)
    def __len__(self):
        return len(self.X_seq)
    def __getitem__(self, idx):
        return torch.from_numpy(self.X_seq[idx])

class DenseAE(nn.Module):
    def __init__(self, in_dim=60):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, 40), nn.ReLU(),
            nn.Linear(40, 20), nn.ReLU(),
            nn.Linear(20, 10), nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, 20), nn.ReLU(),
            nn.Linear(20, 40), nn.ReLU(),
            nn.Linear(40, in_dim),
        )
    def forward(self, x):
        z = self.encoder(x)
        out = self.decoder(z)
        return out

class LSTMAE(nn.Module):
    def __init__(self, input_dim=60, hidden_dim=64, latent_dim=16, num_layers=1):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.to_latent = nn.Linear(hidden_dim, latent_dim)
        self.from_latent = nn.Linear(latent_dim, hidden_dim)
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.out = nn.Linear(hidden_dim, input_dim)
    def forward(self, x):  # x: (B, T, F)
        enc_out, _ = self.encoder(x)
        h_last = enc_out[:, -1, :]
        z = self.to_latent(h_last)
        h0 = self.from_latent(z).unsqueeze(0)
        c0 = torch.zeros_like(h0)
        dec_out, _ = self.decoder(x, (h0, c0))
        y = self.out(dec_out)
        return y

class EarlyStopper:
    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience; self.min_delta = min_delta
        self.best = float("inf"); self.count = 0
    def step(self, val):
        if val < self.best - self.min_delta:
            self.best = val; self.count = 0; return True
        else:
            self.count += 1; return False
    def should_stop(self):
        return self.count >= self.patience

def make_loader(ds, batch_size=256, shuffle_flag=True):
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle_flag, drop_last=False, num_workers=0, pin_memory=True)

# %%
# 3) Load SMALL CSV, split, fit scaler on TRAIN (small), build datasets
X_small = load_signals_csv(SMALL_CSV, FEATURES, TIMESTAMP_COL)
X_s_tr, X_s_va = train_test_split(X_small, test_size=0.2, random_state=SEED, shuffle=True)

scaler_small = MinMaxScaler()
Xs_tr = scaler_small.fit_transform(X_s_tr)
Xs_va = scaler_small.transform(X_s_va)

# Save small scaler for traceability
joblib.dump(scaler_small, ARTIFACTS_DIR/"scaler_small.joblib")

tr_ds_dense = TabularDataset(Xs_tr)
va_ds_dense = TabularDataset(Xs_va)

tr_ds_seq = SequenceDataset(Xs_tr, seq_len=SEQ_LEN, stride=STRIDE)
va_ds_seq = SequenceDataset(Xs_va, seq_len=SEQ_LEN, stride=STRIDE)

print("Small set shapes:", Xs_tr.shape, Xs_va.shape, "| LSTM windows:", len(tr_ds_seq), len(va_ds_seq))

# %%
# 4) Hyperparam search (small): Dense AE + LSTM AE
param_grid_dense = [
    {"lr": 1e-3, "batch": 512, "epochs": 50, "patience": 5, "weight_decay": 0.0},
    {"lr": 5e-4, "batch": 512, "epochs": 50, "patience": 5, "weight_decay": 1e-5},
]
param_grid_lstm = [
    {"lr": 1e-3, "batch": 128, "epochs": 50, "patience": 5, "hidden": 64, "latent": 16, "layers": 1},
    {"lr": 5e-4, "batch": 128, "epochs": 50, "patience": 5, "hidden": 96, "latent": 24, "layers": 1},
]

results = {"dense": [], "lstm": []}

# Dense grid
for p in param_grid_dense:
    model = DenseAE(in_dim=60).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=p["lr"], weight_decay=p.get("weight_decay", 0.0))
    crit = nn.MSELoss()
    tr_loader = make_loader(tr_ds_dense, batch_size=p["batch"], shuffle_flag=True)
    va_loader = make_loader(va_ds_dense, batch_size=p["batch"], shuffle_flag=False)
    es = EarlyStopper(patience=p["patience"], min_delta=0.0)

    best_val = float("inf"); best_state = None; no_improve = 0
    for epoch in range(p["epochs"]):
        model.train(); tr_sum = 0.0; n_tr = 0
        for xb in tr_loader:
            xb = xb.to(DEVICE); opt.zero_grad(set_to_none=True)
            recon = model(xb); loss = crit(recon, xb); loss.backward(); opt.step()
            tr_sum += loss.item()*xb.size(0); n_tr += xb.size(0)
        model.eval(); va_sum = 0.0; n_va = 0
        with torch.no_grad():
            for xb in va_loader:
                xb = xb.to(DEVICE)
                recon = model(xb); loss = crit(recon, xb)
                va_sum += loss.item()*xb.size(0); n_va += xb.size(0)
        va_loss = va_sum / max(1, n_va)
        if va_loss < best_val - 1e-8:
            best_val = va_loss
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= p["patience"]:
                break
    results["dense"].append({"params": p, "val_loss": best_val, "state": best_state})

# LSTM grid
for p in param_grid_lstm:
    model = LSTMAE(input_dim=60, hidden_dim=p["hidden"], latent_dim=p["latent"], num_layers=p["layers"]).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=p["lr"]) 
    crit = nn.MSELoss()
    tr_loader = make_loader(tr_ds_seq, batch_size=p["batch"], shuffle_flag=True)
    va_loader = make_loader(va_ds_seq, batch_size=p["batch"], shuffle_flag=False)
    es = EarlyStopper(patience=p["patience"], min_delta=0.0)

    best_val = float("inf"); best_state = None; no_improve = 0
    for epoch in range(p["epochs"]):
        model.train(); tr_sum = 0.0; n_tr = 0
        for xb in tr_loader:
            xb = xb.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            recon = model(xb); loss = crit(recon, xb); loss.backward(); opt.step()
            tr_sum += loss.item()*xb.size(0); n_tr += xb.size(0)
        model.eval(); va_sum = 0.0; n_va = 0
        with torch.no_grad():
            for xb in va_loader:
                xb = xb.to(DEVICE)
                recon = model(xb); loss = crit(recon, xb)
                va_sum += loss.item()*xb.size(0); n_va += xb.size(0)
        va_loss = va_sum / max(1, n_va)
        if va_loss < best_val - 1e-8:
            best_val = va_loss
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= p["patience"]:
                break
    results["lstm"].append({"params": p, "val_loss": best_val, "state": best_state})

# %%
# 5) Select best configs; save best_params & small scaler path
best_dense = min(results["dense"], key=lambda d: d["val_loss"]) 
best_lstm  = min(results["lstm"],  key=lambda d: d["val_loss"]) 
print("Best Dense:", best_dense["params"], "val_loss:", best_dense["val_loss"]) 
print("Best LSTM :", best_lstm["params"],  "val_loss:", best_lstm["val_loss"]) 

with open(ARTIFACTS_DIR/"best_params.json","w") as f:
    json.dump({
        "dense": best_dense["params"],
        "lstm":  best_lstm["params"],
        "seq_len": SEQ_LEN,
        "stride":  STRIDE
    }, f, indent=2)
print("Saved:", (ARTIFACTS_DIR/"best_params.json").resolve())

# %%
# 6) Load BIG CSV, split, fit PRODUCTION scaler, save scaler & features
X_big = load_signals_csv(BIG_CSV, FEATURES, TIMESTAMP_COL)
Xb_tr, Xb_val = train_test_split(X_big, test_size=0.2, random_state=SEED, shuffle=True)

scaler_prod = MinMaxScaler()
Xb_tr_s = scaler_prod.fit_transform(Xb_tr)
Xb_val_s = scaler_prod.transform(Xb_val)

joblib.dump(scaler_prod, ARTIFACTS_DIR/"scaler_prod.joblib")
with open(ARTIFACTS_DIR/"features.json","w") as f:
    json.dump({"features": FEATURES, "timestamp_col": TIMESTAMP_COL}, f, indent=2)
print("Saved:", (ARTIFACTS_DIR/"scaler_prod.joblib").resolve())
print("Saved:", (ARTIFACTS_DIR/"features.json").resolve())

# %%
# 7) FINAL TRAIN — Dense AE on BIG set; save state_dict
pdense = best_dense["params"]
dense_final = DenseAE(in_dim=60).to(DEVICE)
opt = torch.optim.AdamW(dense_final.parameters(), lr=pdense["lr"], weight_decay=pdense.get("weight_decay", 0.0))
crit = nn.MSELoss()
tr_loader = make_loader(TabularDataset(Xb_tr_s), batch_size=pdense["batch"], shuffle_flag=True)
va_loader = make_loader(TabularDataset(Xb_val_s), batch_size=pdense["batch"], shuffle_flag=False)

best_val = float("inf"); best_state = None; patience = int(pdense.get("patience",5)); epochs = int(pdense.get("epochs",50)); no_improve = 0
for epoch in range(epochs):
    # train
    dense_final.train(); tr_sum=0.0; n_tr=0
    for xb in tr_loader:
        xb = xb.to(DEVICE); opt.zero_grad(set_to_none=True)
        recon = dense_final(xb); loss = crit(recon, xb); loss.backward(); opt.step()
        tr_sum += loss.item()*xb.size(0); n_tr += xb.size(0)
    # val
    dense_final.eval(); va_sum=0.0; n_va=0
    with torch.no_grad():
        for xb in va_loader:
            xb = xb.to(DEVICE); recon = dense_final(xb); loss = crit(recon, xb)
            va_sum += loss.item()*xb.size(0); n_va += xb.size(0)
    va_loss = va_sum / max(1, n_va)
    if va_loss < best_val - 1e-8:
        best_val = va_loss
        best_state = {k: v.detach().cpu().clone() for k, v in dense_final.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            break

if best_state is not None:
    dense_final.load_state_dict(best_state)
torch.save(dense_final.state_dict(), ARTIFACTS_DIR/"dense_ae.pt")
print("Saved:", (ARTIFACTS_DIR/"dense_ae.pt").resolve())

# Update training report
drep_path = ARTIFACTS_DIR/"training_report.json"
try:
    drep = json.load(open(drep_path))
except FileNotFoundError:
    drep = {}
drep["dense"] = {"best_val": float(best_val)}
json.dump(drep, open(drep_path, "w"), indent=2)

# %%
# 8) FINAL TRAIN — LSTM AE on BIG set; save state_dict
plstm = best_lstm["params"]
tr_seq = SequenceDataset(Xb_tr_s, seq_len=SEQ_LEN, stride=STRIDE)
va_seq = SequenceDataset(Xb_val_s, seq_len=SEQ_LEN, stride=STRIDE)

lstm_final = LSTMAE(input_dim=60, hidden_dim=int(plstm.get("hidden",64)),
                    latent_dim=int(plstm.get("latent",16)), num_layers=int(plstm.get("layers",1))).to(DEVICE)
opt = torch.optim.AdamW(lstm_final.parameters(), lr=plstm["lr"]) 
crit = nn.MSELoss()
tr_loader = make_loader(tr_seq, batch_size=plstm["batch"], shuffle_flag=True)
va_loader = make_loader(va_seq, batch_size=plstm["batch"], shuffle_flag=False)

best_val = float("inf"); best_state = None; patience = int(plstm.get("patience",5)); epochs = int(plstm.get("epochs",50)); no_improve = 0
for epoch in range(epochs):
    # train
    lstm_final.train(); tr_sum=0.0; n_tr=0
    for xb in tr_loader:
        xb = xb.to(DEVICE); opt.zero_grad(set_to_none=True)
        recon = lstm_final(xb); loss = crit(recon, xb); loss.backward(); opt.step()
        tr_sum += loss.item()*xb.size(0); n_tr += xb.size(0)
    # val
    lstm_final.eval(); va_sum=0.0; n_va=0
    with torch.no_grad():
        for xb in va_loader:
            xb = xb.to(DEVICE); recon = lstm_final(xb); loss = crit(recon, xb)
            va_sum += loss.item()*xb.size(0); n_va += xb.size(0)
    va_loss = va_sum / max(1, n_va)
    if va_loss < best_val - 1e-8:
        best_val = va_loss
        best_state = {k: v.detach().cpu().clone() for k, v in lstm_final.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            break

if best_state is not None:
    lstm_final.load_state_dict(best_state)
torch.save(lstm_final.state_dict(), ARTIFACTS_DIR/"lstm_ae.pt")
print("Saved:", (ARTIFACTS_DIR/"lstm_ae.pt").resolve())

# Update training report & ensure best_params includes seq
try:
    rep = json.load(open(drep_path))
except FileNotFoundError:
    rep = {}
rep["lstm"] = {"best_val": float(best_val), "seq_len": int(SEQ_LEN), "stride": int(STRIDE)}
json.dump(rep, open(drep_path, "w"), indent=2)

with open(ARTIFACTS_DIR/"best_params.json","w") as f:
    json.dump({
        "dense": best_dense["params"],
        "lstm":  best_lstm["params"],
        "seq_len": SEQ_LEN,
        "stride":  STRIDE
    }, f, indent=2)

# %%
# 9) Summary of saved artifacts
print("\nArtifacts saved:")
for p in [
    ARTIFACTS_DIR/"scaler_small.joblib",
    ARTIFACTS_DIR/"scaler_prod.joblib",
    ARTIFACTS_DIR/"dense_ae.pt",
    ARTIFACTS_DIR/"lstm_ae.pt",
    ARTIFACTS_DIR/"best_params.json",
    ARTIFACTS_DIR/"features.json",
    ARTIFACTS_DIR/"training_report.json",
]:
    print(" -", p.resolve())




Using device: cpu
Small set shapes: (799, 60) (200, 60) | LSTM windows: 770 171




Best Dense: {'lr': 0.001, 'batch': 512, 'epochs': 50, 'patience': 5, 'weight_decay': 0.0} val_loss: 0.02488909475505352
Best LSTM : {'lr': 0.001, 'batch': 128, 'epochs': 50, 'patience': 5, 'hidden': 64, 'latent': 16, 'layers': 1} val_loss: 0.022485879353351064
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\best_params.json
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_prod.joblib
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\features.json




Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\dense_ae.pt
Saved: C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\lstm_ae.pt

Artifacts saved:
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_small.joblib
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\scaler_prod.joblib
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\dense_ae.pt
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\lstm_ae.pt
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\best_params.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\features.json
 - C:\Users\Ishaan Tiwari\Desktop\Kafka\artifacts\training_report.json
