# Phase 5: LSTM Autoencoder Training (Kaggle Self-Contained)

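**Objective**: Train the LSTM Autoencoder (64 hidden units → 3-dimensional latent vector → 64 hidden units, per `HIDDEN_DIM` / `LATENT_DIM` in the configuration cell) on the Physics-Enriched Tensor.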
**Refinements**: 
*   Latent Non-linearity (`LeakyReLU`)
*   Gradient Clipping (for Linear Output stability)
*   MAE Loss (L1)
*   **OOM Fix**: Lazy Loading & Validation Batching

**Inputs**: 
*   `train.parquet` (Upload to Kaggle Dataset)
*   `val.parquet` (Upload to Kaggle Dataset)

**Outputs**:
*   `lstm_ae_champion.pth`
*   Loss Curves



In [None]:
import sys
import os

# --- Auto-Detection: Local vs Kaggle ---
# Make the parent directory importable so that `src` resolves when the
# notebook runs from inside the project tree. The membership check keeps
# sys.path free of duplicate entries when this cell is re-executed.
_PROJECT_ROOT = os.path.abspath("..")
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

try:
    from src import config
except ImportError:
    # No project package on the path: treat this as a standalone Kaggle kernel.
    IS_LOCAL_ENV = False
    print("🌍 Environment Detected: KAGGLE (Standalone Kernel)")
    RESULTS_DIR = "/kaggle/working"
    DATA_DIR = "/kaggle/input/anomalydetection4pharma-gold-tensor"
else:
    IS_LOCAL_ENV = True
    print("✅ Environment Detected: LOCAL (Project Structure Found)")
    RESULTS_DIR = "results/lstm"
    # DATA_DIR may be a local folder or an "s3://" URL; it is used as-is and
    # no S3-specific handling happens in this cell.
    DATA_DIR = getattr(config, "BUCKET_GOLD", "gold")

os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"Using Results Dir: {RESULTS_DIR}")

# Config
# Windowing: time steps per window and the hop between consecutive window
# starts. Original note: 60 steps = 10 minutes (implies 10 s sampling — confirm).
WINDOW_SIZE, STRIDE = 60, 5

# Optimisation settings: mini-batch size, maximum epochs, initial learning rate.
BATCH_SIZE, EPOCHS, LR = 64, 50, 0.001

# Model widths: LSTM hidden size and latent bottleneck size.
HIDDEN_DIM, LATENT_DIM = 64, 3

# Feature columns expected in the parquet files, in tensor order.
TENSOR_FEATURES = [
    "dynamic_tensile_strength", "ejection", "tbl_speed", "cyl_main", "tbl_fill",
]

## 1. Helper Functions (Embedded)
The helper source code is pasted inline below so that the notebook runs on Kaggle without uploading the project's `src` folders.



In [None]:
# From src/data/window_loader.py
def create_windows(df, window_size=60, stride=5, feature_cols=None):
    """
    Creates sequences of length `window_size` from a DataFrame.

    If the frame has a 'batch' column, each batch is sorted by 'timestamp' and
    windowed on its own so that no window spans two batches; batches shorter
    than `window_size` contribute nothing. Without a 'batch' column the rows
    are windowed in their existing order.

    Args:
        df: DataFrame holding `feature_cols` (plus 'batch' and 'timestamp'
            when batch-aware windowing is wanted).
        window_size: Number of consecutive rows per window (must be > 0).
        stride: Step between consecutive window starts (must be > 0).
        feature_cols: List of column names to extract, in output order.

    Returns:
        Array of shape (n_windows, window_size, len(feature_cols)). When no
        window fits, an empty array with that same trailing shape is returned
        so downstream code can still rely on a 3-D result.

    Raises:
        ValueError: If `feature_cols` is None, or `window_size` / `stride`
            is not positive.
    """
    if feature_cols is None:
        raise ValueError("Must provide feature_cols list")
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive integers")

    # If 'batch' column exists, we must not window across batch boundaries.
    if 'batch' in df.columns:
        segments = (
            group.sort_values('timestamp')[feature_cols].values
            for _, group in df.groupby('batch')
        )
    else:
        segments = [df[feature_cols].values]

    windows = []
    for data in segments:
        # A segment shorter than window_size yields an empty range here.
        last_start = len(data) - window_size
        windows.extend(
            data[start:start + window_size]
            for start in range(0, last_start + 1, stride)
        )

    if not windows:
        return np.empty((0, window_size, len(feature_cols)))
    return np.stack(windows)

In [None]:
# From src/models/lstm_ae.py
class LSTMAutoencoder(nn.Module):
    """
    LSTM Autoencoder for Anomaly Detection in Time Series (PyTorch Version).

    Architecture:
        Input(Window) -> Encoder(LSTM) -> Latent(Vector) -> Decoder(LSTM) -> Output(Window)

    Args:
        window_size: Number of time steps the decoder emits for each window.
        n_features: Number of features per time step.
        latency_dim: Size of the latent bottleneck vector (the parameter name
            is kept as-is for backward compatibility).
        hidden_dim: Hidden size of both the encoder and the decoder LSTM.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability applied to the encoder's final hidden
            state, and between LSTM layers when `num_layers > 1`.

    The module moves itself to CUDA when available, otherwise to CPU.
    """

    def __init__(self, window_size, n_features, latency_dim=3, hidden_dim=64, num_layers=1, dropout=0.2):
        super().__init__()
        self.window_size = window_size
        self.n_features = n_features
        self.latency_dim = latency_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # nn.LSTM only applies inter-layer dropout, so it is disabled for a
        # single-layer network.
        lstm_dropout = dropout if num_layers > 1 else 0

        # Encoder
        self.encoder = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )
        self.dropout_layer = nn.Dropout(dropout)

        # Latent compression
        self.to_latent = nn.Linear(hidden_dim, latency_dim)
        self.from_latent = nn.Linear(latency_dim, hidden_dim)

        # Decoder
        self.decoder = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, n_features)

        # Initialize Weights
        self._init_weights()
        self.to(self.device)

    def _init_weights(self):
        """Xavier-normal init for every weight parameter; biases set to zero."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)

    def forward(self, x):
        """
        Encode a batch of windows to latent vectors and reconstruct them.

        Args:
            x: Batch-first input tensor (batch, time, n_features), already on
               the model's device.

        Returns:
            Tuple (reconstructed, latent): the reconstruction with
            `window_size` time steps and `n_features` channels, and the
            latent vectors of size `latency_dim`.
        """
        # Encoder: keep only the final hidden state of the top layer.
        _, (hidden_n, _) = self.encoder(x)
        last_hidden = self.dropout_layer(hidden_n[-1])

        # Latent, with a non-linearity for complex patterns
        latent = nn.functional.leaky_relu(self.to_latent(last_hidden))

        # Decoder prep: expand the latent back to hidden size and repeat it
        # once per output time step.
        hidden_restored = self.from_latent(latent)
        repeated_hidden = hidden_restored.unsqueeze(1).repeat(1, self.window_size, 1)

        # Decoder
        dec_out, _ = self.decoder(repeated_hidden)

        # Reconstruction
        reconstructed = self.output_layer(dec_out)
        return reconstructed, latent  # Return latent for visualization

    def save_checkpoint(self, path, epoch, optimizer, loss):
        """
        Saves full checkpoint with metadata.

        Stores the epoch, model and optimizer state dicts, the loss, and the
        constructor settings needed to rebuild the model. The parent
        directory of `path` is created when it does not exist yet.
        """
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        torch.save({
            'epoch': epoch,
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'config': {
                'window_size': self.window_size,
                'n_features': self.n_features,
                'hidden_dim': self.hidden_dim,
                'latent_dim': self.latency_dim
            }
        }, path)
        print(f"  💾 Checkpoint saved: {path}")

    def train_model(self, X_train, X_val, epochs=50, batch_size=64, lr=1e-3,
                    patience=15, scheduler_patience=3, scheduler_factor=0.5,
                    save_path="models/lstm_ae_champion.pth", noise_factor=0.0):
        """
        Trains the model with MAE loss, a ReduceLROnPlateau learning-rate
        scheduler, gradient clipping and early stopping.

        Data stays on the CPU and is moved to the device one batch at a time
        (for both training and validation) to prevent OOM.

        Args:
            X_train: NumPy array of training windows.
            X_val: NumPy array of validation windows.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size for training and validation.
            lr: Initial Adam learning rate.
            patience: Epochs without validation improvement before stopping.
            scheduler_patience: Plateau epochs before the LR is reduced.
            scheduler_factor: Multiplicative LR reduction factor.
            save_path: Where the best checkpoint (lowest validation loss) is
                written.
            noise_factor: Std-dev of Gaussian noise added to the training
                inputs (denoising mode); 0 disables it. Targets stay clean.

        Returns:
            Dict with per-epoch 'train_loss' and 'val_loss' lists.

        Raises:
            ValueError: If `X_train` or `X_val` contains no windows.

        Note:
            The best weights are only written to `save_path`; the in-memory
            model keeps the weights of the last epoch that ran.
        """
        if len(X_train) == 0 or len(X_val) == 0:
            raise ValueError("X_train and X_val must each contain at least one window")

        # Convert to Tensor but KEEP ON CPU initially
        train_tensor = torch.from_numpy(X_train.astype(np.float32))
        val_tensor = torch.from_numpy(X_val.astype(np.float32))

        train_loader = DataLoader(TensorDataset(train_tensor, train_tensor), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(TensorDataset(val_tensor, val_tensor), batch_size=batch_size, shuffle=False)

        criterion = nn.L1Loss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=scheduler_factor, patience=scheduler_patience
        )

        best_loss = float('inf')
        counter = 0
        history = {'train_loss': [], 'val_loss': []}

        print(f"🚀 Device: {self.device}")

        for epoch in range(epochs):
            self.train()
            train_loss_sum = 0.0

            for batch_x, target_x in train_loader:
                # Move to GPU Just-In-Time
                batch_x = batch_x.to(self.device)
                target_x = target_x.to(self.device)

                # Apply Denoising Noise (if enabled)
                if noise_factor > 0:
                    batch_x = batch_x + torch.randn_like(batch_x) * noise_factor

                optimizer.zero_grad()
                output, _ = self(batch_x)
                loss = criterion(output, target_x)
                loss.backward()
                nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
                optimizer.step()
                # Weight by batch size so a smaller final batch does not
                # skew the epoch mean.
                train_loss_sum += loss.item() * target_x.size(0)

            avg_train_loss = train_loss_sum / len(train_tensor)

            # Validation (Batched)
            self.eval()
            val_loss_sum = 0.0
            with torch.no_grad():
                for val_batch, _ in val_loader:
                    val_batch = val_batch.to(self.device)
                    val_out, _ = self(val_batch)
                    val_loss_sum += criterion(val_out, val_batch).item() * val_batch.size(0)

            avg_val_loss = val_loss_sum / len(val_tensor)

            history['train_loss'].append(avg_train_loss)
            history['val_loss'].append(avg_val_loss)

            print(f"Epoch [{epoch+1}/{epochs}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")

            # Update Scheduler
            scheduler.step(avg_val_loss)

            # Checkpoint on improvement, otherwise count towards early stopping
            if avg_val_loss < best_loss:
                best_loss = avg_val_loss
                self.save_checkpoint(save_path, epoch, optimizer, best_loss)
                counter = 0
            else:
                counter += 1
                if counter >= patience:
                    print(f"🛑 Early Stopping triggered after {patience} epochs without improvement.")
                    break

        return history

    def predict(self, X, batch_size=256):
        """
        Reconstructs the windows in `X`.

        The input stays on the CPU and is moved to the device batch by batch,
        matching the just-in-time transfer used during training.

        Args:
            X: NumPy array of windows.
            batch_size: Number of windows per forward pass.

        Returns:
            float32 NumPy array of reconstructions. For empty input, an empty
            array of shape (0, window_size, n_features).
        """
        self.eval()
        tensor_X = torch.from_numpy(X.astype(np.float32))
        loader = DataLoader(TensorDataset(tensor_X), batch_size=batch_size, shuffle=False)

        predictions = []
        with torch.no_grad():
            for (batch_x,) in loader:
                out, _ = self(batch_x.to(self.device))
                predictions.append(out.cpu().numpy())

        if not predictions:
            # np.concatenate rejects an empty list; return a well-shaped empty result.
            return np.empty((0, self.window_size, self.n_features), dtype=np.float32)
        return np.concatenate(predictions, axis=0)

    def get_reconstruction_error(self, X):
        """
        Calculates MSE for each sample window.

        Note that this score is a mean squared error, whereas training
        optimises mean absolute error (L1).

        Returns: Array of shape (n_samples,) containing the MSE score.
        """
        X_pred = self.predict(X)
        mse = np.mean(np.square(X - X_pred), axis=(1, 2))
        return mse

## 2. Load Gold Data
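Load the train/validation parquet files from the configured data directory (a local path or an `s3://` URL). If a local path does not exist, fall back to the current directory, the Kaggle `../input` directories, or `../gold`.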



In [None]:
# Search for parquet files or use S3
def load_parquet_data(filename, data_dir, is_local=False):
    """
    Load a parquet file from `data_dir`, with local fallback locations.

    Remote URLs (anything containing "://", e.g. "s3://...") are read as
    given. For a local path that does not exist, a fixed list of alternate
    locations (current directory, Kaggle input folders, `../gold`) is tried
    in order.

    Args:
        filename: Parquet file name, e.g. "train.parquet".
        data_dir: Directory or URL prefix that should contain the file.
        is_local: True when running inside the project; storage options are
            then taken from the project's `config` module for remote URLs.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        FileNotFoundError: If a local path is requested and the file is in
            none of the searched locations.
    """
    path = f"{data_dir}/{filename}"
    is_remote = "://" in path

    if not is_remote and not os.path.exists(path):
        # Checking alternate local paths if not remote
        possible_paths = [
            filename,
            f"../input/{filename}",
            f"../input/anomalydetection4pharma-gold-tensor/{filename}",
            f"../gold/{filename}"
        ]
        found = next((p for p in possible_paths if os.path.exists(p)), None)
        if found is None:
            searched = ", ".join([path] + possible_paths)
            raise FileNotFoundError(f"{filename} not found. Searched: {searched}")
        path = found

    # pandas rejects non-empty storage_options for plain local paths, so the
    # project's storage configuration is only looked up for remote URLs.
    storage_options = None
    if is_local and is_remote:
        storage_options = config.get_pandas_storage_options()

    print(f"Loading {filename} from {path}...")
    return pd.read_parquet(path, storage_options=storage_options)

# Load both splits ordered by timestamp. A failure is reported and then
# re-raised: continuing without df_train / df_val would only surface later
# as an unrelated NameError in the cells that use them.
try:
    df_train = load_parquet_data("train.parquet", DATA_DIR, IS_LOCAL_ENV).sort_values('timestamp')
    df_val = load_parquet_data("val.parquet", DATA_DIR, IS_LOCAL_ENV).sort_values('timestamp')
except Exception as e:
    print(f"Error loading data: {e}")
    raise
else:
    print(f"Train Loaded: {len(df_train)} rows")
    print(f"Val Loaded:   {len(df_val)} rows")

## 3. Generate Windows & Train



In [None]:
# Run training and keep the per-epoch loss history for the export cell below.
# `model`, `X_train` and `X_val` come from a cell not shown here (presumably
# an LSTMAutoencoder and the windowed train/val tensors — confirm).
# The checkpoint is written into RESULTS_DIR; noise_factor=0.05 is passed to
# train_model to enable its input-noise option.
history = model.train_model(
    X_train,
    X_val,
    save_path=os.path.join(RESULTS_DIR, "lstm_ae_champion.pth"),
    noise_factor=0.05,
    lr=LR,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

In [None]:
# 1. Save the training history (CSV)
history_df = pd.DataFrame(history)
history_csv_path = os.path.join(RESULTS_DIR, "training_history.csv")
history_df.to_csv(history_csv_path, index=False)
print(f"✅ Historial guardado: {RESULTS_DIR}/training_history.csv")

# 2. Save the model metadata (configuration)
# so the model can be rebuilt later without remembering the exact parameters.
model_config = dict(
    window_size=WINDOW_SIZE,
    n_features=len(TENSOR_FEATURES),
    hidden_dim=HIDDEN_DIM,
    latent_dim=LATENT_DIM,
    features=TENSOR_FEATURES,
)
config_pkl_path = os.path.join(RESULTS_DIR, "model_config.pkl")
with open(config_pkl_path, "wb") as config_file:
    pickle.dump(model_config, config_file)
print(f"✅ Configuración guardada: {RESULTS_DIR}/model_config.pkl")

# 3. Save the loss plot (image): train and validation curves on one figure.
plt.figure(figsize=(10, 6))
for loss_key, curve_label, curve_color in (
    ('train_loss', 'Train MAE', 'blue'),
    ('val_loss', 'Val MAE', 'orange'),
):
    plt.plot(history[loss_key], label=curve_label, color=curve_color)
plt.title("Curva de Aprendizaje - LSTM Autoencoder")
plt.xlabel("Epoch")
plt.ylabel("Loss (MAE)")
plt.legend()
plt.grid(True, alpha=0.3)
loss_plot_path = os.path.join(RESULTS_DIR, "loss_curve.png")
plt.savefig(loss_plot_path)
print(f"✅ Gráfico guardado: {RESULTS_DIR}/loss_curve.png")

# Final summary of the files produced by the run.
print("\n📦 ARCHIVOS LISTOS:")
print("\n".join((
    "1. lstm_ae_champion.pth",
    "2. model_config.pkl",
    "3. loss_curve.png",
    "4. training_history.csv",
)))