# LSTM Model Training

This notebook covers:
1. Loading preprocessed data
2. LSTM model architecture design
3. Training with MLflow tracking
4. Model evaluation and validation
5. Hyperparameter tuning
6. Model saving and registration

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch.utils.data import DataLoader, TensorDataset

# Silence only deprecation chatter. A blanket `filterwarnings('ignore')` also
# hides PyTorch's UserWarning about mismatched input/target shapes in
# nn.MSELoss, which is the only hint that a loss is being broadcast instead of
# compared element-wise.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Import our model and utilities
import sys

# Make ../src importable. Skip the append when the cell is re-run so that
# sys.path does not accumulate duplicate entries.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from models.lstm_model import LSTMTimeSeriesModel

# Seed the random number generators so that weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Set device: use the GPU when one is visible to PyTorch, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## 1. Load Preprocessed Data

In [None]:
# Load processed data
data_dir = Path('../data/processed')

# Load sequences
X_train = np.load(data_dir / 'X_train.npy')
y_train = np.load(data_dir / 'y_train.npy')
X_val = np.load(data_dir / 'X_val.npy')
y_val = np.load(data_dir / 'y_val.npy')
X_test = np.load(data_dir / 'X_test.npy')
y_test = np.load(data_dir / 'y_test.npy')

print("Original data shapes:")
print(f"Training data: X={X_train.shape}, y={y_train.shape}")
print(f"Validation data: X={X_val.shape}, y={y_val.shape}")
print(f"Test data: X={X_test.shape}, y={y_test.shape}")

# Extract only the target variable (first feature) from the y arrays.
# The stored targets have shape (n_samples, 1, n_features) and we want
# (n_samples, 1). Indexing the last axis with the integer 0 drops that axis;
# the slice 0:1 would keep it and leave (n_samples, 1, 1). Paired with model
# outputs of shape (batch, output_size) -- as the model-configuration cell of
# this notebook describes them -- nn.MSELoss would broadcast such targets to
# (batch, batch, 1) and compare every prediction with every target.
y_train = y_train[:, :, 0]
y_val = y_val[:, :, 0]
y_test = y_test[:, :, 0]

# Fail early if the targets are not one 2-D row per input sequence.
for _split, _X, _y in (("train", X_train, y_train),
                       ("val", X_val, y_val),
                       ("test", X_test, y_test)):
    assert _y.ndim == 2 and _y.shape[0] == _X.shape[0], (
        f"{_split}: expected targets of shape ({_X.shape[0]}, horizon), got {_y.shape}"
    )

print("\nAfter extracting target variable:")
print(f"Training data: X={X_train.shape}, y={y_train.shape}")
print(f"Validation data: X={X_val.shape}, y={y_val.shape}")
print(f"Test data: X={X_test.shape}, y={y_test.shape}")

# Convert to float32 PyTorch tensors placed directly on the selected device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32, device=device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32, device=device)

print("\nTensor shapes:")
print(f"X_train_tensor: {X_train_tensor.shape}")
print(f"y_train_tensor: {y_train_tensor.shape}")

## 2. Create Data Loaders

In [None]:
# Mini-batch size shared by all three loaders
batch_size = 32

# Pair each split's input tensor with its target tensor
train_dataset, val_dataset, test_dataset = (
    TensorDataset(inputs, targets)
    for inputs, targets in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

# Only the training loader shuffles; validation and test keep their stored order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Report how many mini-batches each loader yields per pass
print(
    f"Training batches: {len(train_loader)}\n"
    f"Validation batches: {len(val_loader)}\n"
    f"Test batches: {len(test_loader)}"
)

## 3. Set Up MLflow Tracking (Locally)

In [None]:
# Experiment under which this notebook's runs are grouped
experiment_name = "lstm-time-series-forecasting"

# Store tracking data in a local ./mlruns directory, then select the experiment
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment(experiment_name)

# Echo the resolved configuration for a quick visual check
tracking_uri = mlflow.get_tracking_uri()
print(f"MLflow tracking URI: {tracking_uri}")
active_experiment = mlflow.get_experiment_by_name(experiment_name)
print(f"Active experiment: {active_experiment}")

## 4. Model Architecture and Training

In [None]:
def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=0.001):
    """
    Train a model with MSE loss and Adam, logging per-epoch losses to MLflow.

    Args:
        model: torch.nn.Module mapping a batch of inputs to predictions.
        train_loader: iterable of (inputs, targets) batches used for optimisation.
        val_loader: iterable of (inputs, targets) batches used for validation only.
        num_epochs: number of full passes over ``train_loader``.
        learning_rate: learning rate passed to Adam.

    Returns:
        (train_losses, val_losses): two lists holding one mean batch loss per epoch.

    Raises:
        ValueError: if a target batch does not hold the same number of elements
            as the model output and therefore cannot be aligned with it.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase: gradients enabled, dropout active
        model.train()
        train_loss = _mean_batch_loss(model, train_loader, criterion, optimizer)

        # Validation phase: no gradient tracking, no parameter updates
        model.eval()
        with torch.no_grad():
            val_loss = _mean_batch_loss(model, val_loader, criterion)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Log metrics to MLflow
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)

        if epoch % 10 == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')

    return train_losses, val_losses


def _match_target_shape(outputs, targets):
    """Return ``targets`` shaped exactly like ``outputs``.

    nn.MSELoss broadcasts mismatched shapes: a (batch, 1) output against a
    (batch, 1, 1) target becomes a (batch, batch, 1) comparison of every
    prediction with every target. Reshaping keeps the loss element-wise.
    """
    if targets.shape == outputs.shape:
        return targets
    if targets.numel() != outputs.numel():
        raise ValueError(
            f"Target shape {tuple(targets.shape)} cannot be aligned with "
            f"model output shape {tuple(outputs.shape)}"
        )
    return targets.reshape(outputs.shape)


def _mean_batch_loss(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    When ``optimizer`` is given, each batch also performs a backward pass and
    a parameter update; otherwise the pass is evaluation only.
    """
    total_loss = 0.0
    for batch_X, batch_y in loader:
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, _match_target_shape(outputs, batch_y))
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [None]:
# Model hyperparameters
input_size = X_train.shape[2]  # Number of input features (last axis of X)
hidden_size = 64
num_layers = 2
output_size = 1  # We're predicting only 1 value (the target variable)
dropout = 0.2
learning_rate = 0.001
num_epochs = 100
# Reuse the batch size the DataLoaders were actually built with, so the value
# printed here and later logged to MLflow cannot drift from the one in effect.
batch_size = train_loader.batch_size

print("Model configuration:")
print(f"  Input size: {input_size} (number of input features)")
print(f"  Hidden size: {hidden_size}")
print(f"  Number of layers: {num_layers}")
print(f"  Output size: {output_size} (predicting 1 target value)")
print(f"  Dropout: {dropout}")
print(f"  Learning rate: {learning_rate}")
print(f"  Epochs: {num_epochs}")
print(f"  Batch size: {batch_size}")

# Side-by-side view of the data shapes and the shapes the model is configured for
print("\nData compatibility check:")
print(f"  Input sequences: {X_train.shape} -> Model expects: (batch, sequence, {input_size})")
print(f"  Target values: {y_train.shape} -> Model outputs: (batch, {output_size})")

In [None]:
# Close any run left active by a previous execution of the notebook.
# mlflow.end_run() does nothing when no run is active, so it needs no
# try/except around it.
mlflow.end_run()

# This cell is a quick smoke test: cap the number of epochs actually trained.
epochs_to_run = min(num_epochs, 10)

# Start MLflow run
with mlflow.start_run(run_name="lstm-baseline") as run:
    # Log hyperparameters. "num_epochs" records what this run really trains
    # for; the uncapped setting is kept alongside it for reference.
    mlflow.log_params({
        "input_size": input_size,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "output_size": output_size,
        "dropout": dropout,
        "learning_rate": learning_rate,
        "num_epochs": epochs_to_run,
        "num_epochs_configured": num_epochs,
        "batch_size": batch_size
    })

    # Initialize model
    model = LSTMTimeSeriesModel(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        output_size=output_size,
        dropout=dropout
    ).to(device)

    print(f"Model initialized with {sum(p.numel() for p in model.parameters()):,} parameters")

    # Test model with a small batch to verify shapes
    print("\n🔍 Testing model forward pass...")
    model.eval()
    with torch.no_grad():
        # Take first batch from train loader
        test_batch_X, test_batch_y = next(iter(train_loader))
        print(f"Input batch shape: {test_batch_X.shape}")
        print(f"Target batch shape: {test_batch_y.shape}")

        # Forward pass
        test_output = model(test_batch_X)
        print(f"Model output shape: {test_output.shape}")

        # Check if shapes are compatible
        if test_output.shape[0] == test_batch_y.shape[0]:
            print("✅ Batch sizes match!")
        else:
            print("❌ Batch size mismatch!")

        # Equal batch sizes are not sufficient: nn.MSELoss broadcasts
        # differently shaped tensors instead of comparing them element-wise.
        if test_output.shape != test_batch_y.shape:
            print(
                f"⚠️ Output shape {tuple(test_output.shape)} differs from target shape "
                f"{tuple(test_batch_y.shape)}; nn.MSELoss would broadcast them."
            )

        print(f"Model output sample: {test_output[0].cpu().numpy()}")
        print(f"Target sample: {test_batch_y[0].cpu().numpy()}")

    # Train model (reduced epochs for testing)
    print(f"\n🚀 Starting training with {epochs_to_run} epochs...")
    train_losses, val_losses = train_model(
        model, train_loader, val_loader, epochs_to_run, learning_rate
    )

    print("\n✅ Training completed successfully!")

## 5. Training Visualization

In [None]:
# Plot training curves
fig, (ax_full, ax_tail) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: the complete loss history
ax_full.plot(train_losses, label='Training Loss')
ax_full.plot(val_losses, label='Validation Loss')
ax_full.set_title('Training and Validation Loss')
ax_full.set_xlabel('Epoch')
ax_full.set_ylabel('Loss (MSE)')
ax_full.legend()
ax_full.grid(True, alpha=0.3)

# Right panel: the most recent epochs (up to 50). The label and the x axis
# reflect how many epochs were really recorded rather than assuming 50.
n_epochs_recorded = len(train_losses)
tail_len = min(50, n_epochs_recorded)
tail_epochs = range(n_epochs_recorded - tail_len, n_epochs_recorded)
ax_tail.plot(tail_epochs, train_losses[n_epochs_recorded - tail_len:],
             label=f'Training Loss (Last {tail_len} epochs)')
ax_tail.plot(tail_epochs, val_losses[n_epochs_recorded - tail_len:],
             label=f'Validation Loss (Last {tail_len} epochs)')
ax_tail.set_title('Training Convergence')
ax_tail.set_xlabel('Epoch')
ax_tail.set_ylabel('Loss (MSE)')
ax_tail.legend()
ax_tail.grid(True, alpha=0.3)

fig.tight_layout()

# Save BEFORE showing: with the inline backend plt.show() closes the figure,
# so a later plt.savefig() would write an empty image.
fig.savefig('training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

# Log the plot to MLflow. The training cell's `with` block already ended the
# run, and a bare mlflow.log_artifact() would silently start a new, unrelated
# run, so resume the training run by its ID. end_run() is a no-op when nothing
# is active and otherwise closes such an implicitly created run.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_artifact('training_curves.png')

## 6. Model Evaluation

In [None]:
def evaluate_model(model, data_loader, data_name="Test"):
    """
    Score a model on every batch of a data loader and print the results.

    Args:
        model: module called as ``model(batch_X)``; switched to eval mode here.
        data_loader: iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        data_name: label used in the printed report.

    Returns:
        dict with the scalar metrics ``mse``, ``rmse``, ``mae`` and ``mape``
        plus the collected ``predictions`` and ``actuals`` arrays.
    """
    model.eval()

    # Inference only: gather each batch's outputs and targets as NumPy arrays
    with torch.no_grad():
        collected = [
            (model(inputs).cpu().numpy(), targets.cpu().numpy())
            for inputs, targets in data_loader
        ]

    # Stack the per-sample rows of all batches into one array each
    predictions = np.array([row for batch_preds, _ in collected for row in batch_preds])
    actuals = np.array([row for _, batch_targets in collected for row in batch_targets])

    print(f"Debug - {data_name} shapes:")
    print(f"  Predictions shape: {predictions.shape}")
    print(f"  Actuals shape: {actuals.shape}")

    # Drop a trailing singleton axis from the targets:
    # (n_samples, 1, 1) -> (n_samples, 1)
    if actuals.ndim == 3 and actuals.shape[-1] == 1:
        actuals = np.squeeze(actuals, axis=-1)

    # The metrics below operate on 1-D vectors
    predictions_flat = predictions.flatten()
    actuals_flat = actuals.flatten()

    print("  After reshaping:")
    print(f"    Predictions flat shape: {predictions_flat.shape}")
    print(f"    Actuals flat shape: {actuals_flat.shape}")

    # Error metrics
    mse = mean_squared_error(actuals_flat, predictions_flat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals_flat, predictions_flat)

    # MAPE uses only non-zero targets to avoid dividing by zero; it is
    # reported as infinity when every target is zero.
    nonzero = actuals_flat != 0
    if nonzero.any():
        reference = actuals_flat[nonzero]
        relative_error = np.abs((reference - predictions_flat[nonzero]) / reference)
        mape = np.mean(relative_error) * 100
    else:
        mape = float('inf')

    print(f"\n{data_name} Metrics:")
    for label, value in (("MSE", mse), ("RMSE", rmse), ("MAE", mae)):
        print(f"  {label}: {value:.6f}")
    print(f"  MAPE: {mape:.2f}%")

    return dict(
        mse=mse,
        rmse=rmse,
        mae=mae,
        mape=mape,
        predictions=predictions,
        actuals=actuals,
    )

# Evaluate on all datasets
train_metrics = evaluate_model(model, train_loader, "Training")
val_metrics = evaluate_model(model, val_loader, "Validation")
test_metrics = evaluate_model(model, test_loader, "Test")

# Collect "<split>_<metric>" entries as plain Python floats
final_metrics = {
    f"{split}_{metric}": float(split_metrics[metric])
    for split, split_metrics in (
        ("train", train_metrics),
        ("val", val_metrics),
        ("test", test_metrics),
    )
    for metric in ("rmse", "mae", "mape")
}

# Log metrics to MLflow. The training cell's `with` block already ended the
# run, and a bare mlflow.log_metrics() would silently start a new, unrelated
# run, so resume the training run by its ID. end_run() is a no-op when nothing
# is active and otherwise closes such an implicitly created run.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_metrics(final_metrics)

## 7. Prediction Visualization

In [None]:
# Plot predictions vs actuals
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Prepare data for plotting
test_actuals = test_metrics['actuals']
test_predictions = test_metrics['predictions']

# Every panel works on 1-D vectors; ravel handles any input rank
test_actuals_flat = np.ravel(test_actuals)
test_predictions_flat = np.ravel(test_predictions)

# Test set predictions scatter plot with the ideal y = x line
value_range = [test_actuals_flat.min(), test_actuals_flat.max()]
axes[0, 0].scatter(test_actuals_flat, test_predictions_flat, alpha=0.6)
axes[0, 0].plot(value_range, value_range, 'r--')
axes[0, 0].set_xlabel('Actual Values')
axes[0, 0].set_ylabel('Predicted Values')
axes[0, 0].set_title('Test Set: Predictions vs Actuals')
axes[0, 0].grid(True, alpha=0.3)

# Time series plot (first 100 test predictions)
n_plot = min(100, len(test_predictions))
actual_plot = test_actuals_flat[:n_plot]
pred_plot = test_predictions_flat[:n_plot]

axes[0, 1].plot(range(n_plot), actual_plot, label='Actual', linewidth=2)
axes[0, 1].plot(range(n_plot), pred_plot, label='Predicted', linewidth=2)
axes[0, 1].set_xlabel('Time Steps')
axes[0, 1].set_ylabel('Value')
axes[0, 1].set_title('Test Set: Time Series Comparison')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Residuals plot (actual minus predicted, against the prediction)
residuals = test_actuals_flat - test_predictions_flat
axes[1, 0].scatter(test_predictions_flat, residuals, alpha=0.6)
axes[1, 0].axhline(y=0, color='r', linestyle='--')
axes[1, 0].set_xlabel('Predicted Values')
axes[1, 0].set_ylabel('Residuals')
axes[1, 0].set_title('Residuals Plot')
axes[1, 0].grid(True, alpha=0.3)

# Error distribution
axes[1, 1].hist(residuals, bins=50, alpha=0.7, edgecolor='black')
axes[1, 1].axvline(x=0, color='r', linestyle='--')
axes[1, 1].set_xlabel('Prediction Error')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Error Distribution')
axes[1, 1].grid(True, alpha=0.3)

fig.tight_layout()

# Save BEFORE showing: with the inline backend plt.show() closes the figure,
# so a later plt.savefig() would write an empty image.
fig.savefig('model_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()

# Log to MLflow. The training cell's `with` block already ended the run, and a
# bare mlflow.log_artifact() would silently start a new, unrelated run, so
# resume the training run by its ID. end_run() is a no-op when nothing is
# active and otherwise closes such an implicitly created run.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_artifact('model_evaluation.png')

In [None]:
# Display final results summary.
# The section icons were stored as mis-decoded UTF-8 (mojibake); they are
# restored to the intended emoji here.
print("="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)

# Headline metrics from the held-out test split
print("\n📊 Final Test Set Performance:")
print(f"   RMSE: {test_metrics['rmse']:.4f}")
print(f"   MAE:  {test_metrics['mae']:.4f}")
print(f"   MAPE: {test_metrics['mape']:.2f}%")

# Architecture recap taken from the hyperparameter cell and the live model
print("\n📈 Model Architecture:")
print(f"   Input Features: {input_size}")
print(f"   Hidden Units: {hidden_size}")
print(f"   LSTM Layers: {num_layers}")
print(f"   Total Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Where to find the tracked run
print("\n💾 Model Artifacts:")
print(f"   MLflow Run ID: {run.info.run_id}")
print("   Model Registry: lstm-time-series-forecasting")

print("\n🔧 Next Steps:")
print("   1. Deploy model using Azure ML endpoints")
print("   2. Implement automated retraining pipelines")
print("   3. Add model monitoring and drift detection")
print("   4. Experiment with advanced architectures (Transformer, GRU)")

print("\n🌐 MLflow UI:")
print("   Run: mlflow ui --backend-store-uri ./mlruns")
print("="*60)

## 8. Model Saving

In [None]:
# Save model
model_dir = Path('../outputs/models')
model_dir.mkdir(parents=True, exist_ok=True)

# Save PyTorch model: weights, the constructor arguments needed to rebuild
# the network, and the headline test metrics.
model_path = model_dir / 'lstm_model.pth'
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'input_size': input_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'output_size': output_size,
        'dropout': dropout
    },
    # Stored as plain Python floats so the checkpoint does not embed pickled
    # NumPy scalar objects (rmse, for example, comes out of np.sqrt).
    'metrics': {
        'test_rmse': float(test_metrics['rmse']),
        'test_mae': float(test_metrics['mae']),
        'test_mape': float(test_metrics['mape'])
    }
}, model_path)

print(f"Model saved to: {model_path}")

# Log model to MLflow. The training cell's `with` block already ended the run,
# and a bare log_model() would attach the model to a new, implicitly created
# run rather than the one whose ID is printed below, so resume the training
# run by its ID. end_run() is a no-op when nothing is active and otherwise
# closes such an implicitly created run.
mlflow.end_run()
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="lstm_model",
        registered_model_name="lstm-time-series-forecasting"
    )

print(f"Model logged to MLflow with run ID: {run.info.run_id}")

## 9. Hyperparameter Tuning Example

In [None]:
# Hyperparameter tuning experiment: candidate values per hyperparameter.
# This cell only displays the grid; it does not launch any training.
hyperparams_grid = {
    'hidden_size': [32, 64, 128],
    'num_layers': [1, 2, 3],
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout': [0.1, 0.2, 0.3]
}

print("Hyperparameter Tuning Grid:")
for param, values in hyperparams_grid.items():
    print(f"  {param}: {values}")

# The icons were stored as mis-decoded UTF-8 (mojibake); they are restored to
# the intended emoji here.
print("\n💡 For comprehensive hyperparameter tuning, consider using:")
print("   - Azure ML Hyperdrive for automated hyperparameter tuning")
print("   - Optuna for advanced optimization")
print("   - Ray Tune for distributed hyperparameter tuning")
print("\n📚 Example implementation can be found in the training script:")
print("   src/training/train_lstm.py")

# Display final results summary


In [None]:
# Closing recap of the run, printed once more at the end of the notebook.
# The section icons were stored as mis-decoded UTF-8 (mojibake); they are
# restored to the intended emoji here.
print("="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)

# Headline metrics from the held-out test split
print("\n📊 Final Test Set Performance:")
print(f"   RMSE: {test_metrics['rmse']:.4f}")
print(f"   MAE:  {test_metrics['mae']:.4f}")
print(f"   MAPE: {test_metrics['mape']:.2f}%")

# Architecture recap taken from the hyperparameter cell and the live model
print("\n📈 Model Architecture:")
print(f"   Input Features: {input_size}")
print(f"   Hidden Units: {hidden_size}")
print(f"   LSTM Layers: {num_layers}")
print(f"   Total Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Where to find the tracked run and the registered model
print("\n💾 Model Artifacts:")
print(f"   MLflow Run ID: {run.info.run_id}")
print("   Model Registry: lstm-time-series-forecasting")

print("\n🔧 Next Steps:")
print("   1. Deploy model using Azure ML endpoints")
print("   2. Implement automated retraining pipelines")
print("   3. Add model monitoring and drift detection")
print("   4. Experiment with advanced architectures (Transformer, GRU)")

print("\n🌐 MLflow UI:")
print("   Run: mlflow ui --backend-store-uri ./mlruns")
print("="*60)