In [None]:
import torch

def available_gpus():
    """Return the names of all CUDA devices visible to PyTorch.

    Returns:
        list: One device-name string per CUDA device index; empty when
        ``torch.cuda.device_count()`` reports no devices.
    """
    names = []
    for index in range(torch.cuda.device_count()):
        names.append(torch.cuda.get_device_name(index))
    return names

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("GPUs disponibles:", available_gpus())

## Libraries

In [2]:
import os
from datetime import datetime
from libraries.utils import read_csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import torch
import torch.nn as nn

from libraries.sequence_generators import sequence_generator_last_event
from models_architectures.AutoEnconderLastEvent import AutoEnconderLastEvent

## Parameters

In [3]:
# Mission and phase: together they select which train/validation date windows
# are looked up in the phase-date tables.
MISSION = 2
PHASE = 4

# Sliding-window length, threshold percentile and optimisation settings.
WINDOW_SIZE = 100
PERCENTILE = 99
BATCH_SIZE = 1024
EPOCHS = 25
LEARNING_RATE = 0.0001

# Channel selection: the trailing index picks one of the three options.
CHANNELS = ["allchannels", "subset", "target"][1]
FIRST_CHANNEL_NUMBER = 18  # Only used when CHANNELS == "subset"
LAST_CHANNEL_NUMBER = 28  # Only used when CHANNELS == "subset"

# NOTE(review): the input path is hard-coded (mission folder and channel range);
# it is not derived from MISSION / FIRST_CHANNEL_NUMBER / LAST_CHANNEL_NUMBER,
# so it must be kept in sync with them by hand.
# Alternative input for CHANNELS == "target":
# INPUT_DATA_PATH = f'../data/Mission2-Preprocessed/data_preprocessed_target_frequency-previous_2000_2003.csv'
INPUT_DATA_PATH = f'../data/Mission2-Preprocessed/data_preprocessed_channels18_28_frequency-previous_2000_2003.csv'

In [4]:
# Date windows for mission 1, stored as ISO-8601 strings.
# Every phase trains from 2000-01-01 and validates on the window that starts
# exactly where its training window ends. Keys follow the pattern
# "phase<N>_{start,end}_date_{train,val}".
mission1_phases_dates = {
    # Test window (starts where the phase-5 validation window ends).
    "test_start_date": "2007-01-01T00:00:00",
    "test_end_date": "2014-01-01T00:00:00",

    "phase1_start_date_train": "2000-01-01T00:00:00",
    "phase1_end_date_train": "2000-03-11T00:00:00",
    "phase1_start_date_val": "2000-03-11T00:00:00",
    "phase1_end_date_val": "2000-04-01T00:00:00",
    
    "phase2_start_date_train": "2000-01-01T00:00:00",
    "phase2_end_date_train": "2000-09-01T00:00:00",
    "phase2_start_date_val": "2000-09-01T00:00:00",
    "phase2_end_date_val": "2000-11-01T00:00:00",
    
    "phase3_start_date_train": "2000-01-01T00:00:00",
    "phase3_end_date_train": "2001-07-01T00:00:00",
    "phase3_start_date_val": "2001-07-01T00:00:00",
    "phase3_end_date_val": "2001-11-01T00:00:00",
    
    "phase4_start_date_train": "2000-01-01T00:00:00",
    "phase4_end_date_train": "2003-04-01T00:00:00",
    "phase4_start_date_val": "2003-04-01T00:00:00",
    "phase4_end_date_val": "2003-07-01T00:00:00",
    
    "phase5_start_date_train": "2000-01-01T00:00:00",
    "phase5_end_date_train": "2006-10-01T00:00:00",
    "phase5_start_date_val": "2006-10-01T00:00:00",
    "phase5_end_date_val": "2007-01-01T00:00:00"
}

# Date windows for mission 2, same key layout as mission 1.
# Only phases 1-4 are defined here, so PHASE = 5 with MISSION = 2 raises a
# KeyError when the dates are looked up.
mission2_phases_dates = {
    # Test window (starts where the phase-4 validation window ends).
    "test_start_date": "2001-10-01T00:00:00",
    "test_end_date": "2003-07-01T00:00:00",

    "phase1_start_date_train": "2000-01-01T00:00:00",
    "phase1_end_date_train": "2000-01-24T00:00:00",
    "phase1_start_date_val": "2000-01-24T00:00:00",
    "phase1_end_date_val": "2000-02-01T00:00:00",
    
    "phase2_start_date_train": "2000-01-01T00:00:00",
    "phase2_end_date_train": "2000-05-01T00:00:00",
    "phase2_start_date_val": "2000-05-01T00:00:00",
    "phase2_end_date_val": "2000-06-01T00:00:00",
    
    "phase3_start_date_train": "2000-01-01T00:00:00",
    "phase3_end_date_train": "2000-09-01T00:00:00",
    "phase3_start_date_val": "2000-09-01T00:00:00",
    "phase3_end_date_val": "2000-11-01T00:00:00",
    
    "phase4_start_date_train": "2000-01-01T00:00:00",
    "phase4_end_date_train": "2001-07-01T00:00:00",
    "phase4_start_date_val": "2001-07-01T00:00:00",
    "phase4_end_date_val": "2001-10-01T00:00:00"
}

# Lookup table: mission number -> that mission's phase-date dictionary.
missions_phases_dates = {
    1: mission1_phases_dates,
    2: mission2_phases_dates
}

In [5]:
# Resolve the train/validation windows for the configured mission and phase.
phase_dates = missions_phases_dates[MISSION]
start_date_train = datetime.fromisoformat(phase_dates[f"phase{PHASE}_start_date_train"])
end_date_train = datetime.fromisoformat(phase_dates[f"phase{PHASE}_end_date_train"])
start_date_val = datetime.fromisoformat(phase_dates[f"phase{PHASE}_start_date_val"])
end_date_val = datetime.fromisoformat(phase_dates[f"phase{PHASE}_end_date_val"])

# Tag used in the checkpoint file name: the channel mode itself, or the
# explicit channel range when a subset of channels is selected.
if CHANNELS in ("allchannels", "target"):
    test_type = CHANNELS
else:
    test_type = f"Channels{FIRST_CHANNEL_NUMBER}-{LAST_CHANNEL_NUMBER}"
model_save_path = f"../models/AutoEnconderLastEvent/Phase{PHASE}_{test_type}_window{WINDOW_SIZE}_percentile_{PERCENTILE}_epochs{EPOCHS}_lr{LEARNING_RATE}.pth"

## Read and preprocess data

### a. Read data

In [None]:
data = read_csv(INPUT_DATA_PATH, sep=";")

# Select rows by index with half-open windows [start, end): the end date is
# excluded, so the train and validation slices do not share rows.
in_train_window = (data.index >= start_date_train) & (data.index < end_date_train)
in_val_window = (data.index >= start_date_val) & (data.index < end_date_val)
train_data = data.loc[in_train_window]
val_data = data.loc[in_val_window]
train_data

### b. Normalization (min-max scaling)

In [None]:
# Scale every column to [0, 1]. The scaler is fitted on the training split
# only and then reused for validation, so no validation statistics leak in.
scaler = MinMaxScaler().fit(train_data)
train_data_normalized = scaler.transform(train_data)
val_data_normalized = scaler.transform(val_data)

# Re-wrap the scaled arrays with the original index and column labels.
train_df_normalized = pd.DataFrame(train_data_normalized, columns=train_data.columns, index=train_data.index)
val_df_normalized = pd.DataFrame(val_data_normalized, columns=val_data.columns, index=val_data.index)
train_df_normalized

## Model architecture

### a. Transformer Autoencoder

In [8]:
# Model dimensions: window length, number of input columns, bottleneck size.
n_features = train_df_normalized.shape[1]
sequence_length = WINDOW_SIZE
latent_dim = 8

# Build the model on the selected device, then the optimizer and the loss.
autoencoder = AutoEnconderLastEvent(sequence_length, n_features, latent_dim)
autoencoder = autoencoder.to(device)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

## Model training

### a. Train

In [None]:
# Training steps per epoch. Floor division counts only whole multiples of
# BATCH_SIZE, so the value is 0 when fewer than WINDOW_SIZE + BATCH_SIZE
# training rows are available.
steps_per_epoch_train = (len(train_df_normalized) - WINDOW_SIZE) // BATCH_SIZE

def train_autoencoder(train_generator, val_generator, model, criterion, optimizer, epochs, steps_per_epoch_train, steps_per_epoch_val, device):
    """Train ``model`` and optionally evaluate it on validation batches.

    Args:
        train_generator: Iterator yielding ``(inputs, targets)`` tensor pairs.
        val_generator: Iterator yielding ``(inputs, targets)`` tensor pairs for
            validation, or ``None`` to skip validation.
        model: The ``torch.nn.Module`` to optimise.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs (int): Number of passes to run.
        steps_per_epoch_train (int): Batches drawn from ``train_generator``
            per epoch.
        steps_per_epoch_val (int): Batches drawn from ``val_generator`` per
            epoch; ignored when ``val_generator`` is ``None``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[list, list]: ``(train_loss_history, val_loss_history)`` holding
        one average loss per epoch. ``val_loss_history`` is empty when no
        validation generator is given.

    Raises:
        ValueError: If a step count that will be used is not positive (the
            per-epoch average would otherwise divide by zero).
    """
    if steps_per_epoch_train <= 0:
        raise ValueError(
            f"steps_per_epoch_train must be positive, got {steps_per_epoch_train}"
        )
    if val_generator is not None and steps_per_epoch_val <= 0:
        raise ValueError(
            f"steps_per_epoch_val must be positive, got {steps_per_epoch_val}"
        )

    train_loss_history = []
    val_loss_history = []

    for epoch in range(epochs):
        # Re-enter training mode every epoch: the validation pass below
        # switches the model to eval mode, which would otherwise persist.
        model.train()
        total_train_loss = 0.0

        # Training loop
        for _ in range(steps_per_epoch_train):
            inputs, targets = next(train_generator)
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / steps_per_epoch_train
        train_loss_history.append(avg_train_loss)

        # Validation loop (only when a validation generator is provided)
        if val_generator is not None:
            model.eval()
            total_val_loss = 0.0

            with torch.no_grad():
                for _ in range(steps_per_epoch_val):
                    val_inputs, val_targets = next(val_generator)
                    # Evaluate on the validation batch itself, not on the
                    # last training batch.
                    val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)

                    val_outputs = model(val_inputs)
                    val_loss = criterion(val_outputs, val_targets)

                    total_val_loss += val_loss.item()

            avg_val_loss = total_val_loss / steps_per_epoch_val
            val_loss_history.append(avg_val_loss)
            print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        else:
            print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {avg_train_loss:.4f}")

    return train_loss_history, val_loss_history


# Validation steps per epoch (whole multiples of BATCH_SIZE only).
steps_per_epoch_val = (len(val_df_normalized) - WINDOW_SIZE) // BATCH_SIZE

# Build the sequence generators for the training and validation splits.
train_gen = sequence_generator_last_event(train_df_normalized.values, WINDOW_SIZE, BATCH_SIZE)
val_gen = sequence_generator_last_event(val_df_normalized.values, WINDOW_SIZE, BATCH_SIZE)

# Train the model and keep the per-epoch loss curves.
train_loss_history, val_loss_history = train_autoencoder(
    train_generator=train_gen,
    val_generator=val_gen,
    model=autoencoder,
    criterion=criterion,
    optimizer=optimizer,
    epochs=EPOCHS,
    steps_per_epoch_train=steps_per_epoch_train,
    steps_per_epoch_val=steps_per_epoch_val,
    device=device,
)

In [None]:
import matplotlib.pyplot as plt

def plot_loss_history(train_loss_history, val_loss_history=None, save_path=None):
    """
    Plots the training and validation loss history.

    Args:
        train_loss_history (list): List of training loss values per epoch.
        val_loss_history (list, optional): List of validation loss values per epoch.
            Plotted only when it is not None and not empty. Default is None.
        save_path (str, optional): File path the figure is written to before it
            is shown. Missing parent directories are created. Default is None
            (the figure is only shown).
    """
    train_epochs = range(1, len(train_loss_history) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_epochs, train_loss_history, label='Training Loss', marker='o', color='blue')

    if val_loss_history is not None and len(val_loss_history) > 0:
        # Use the validation curve's own length for its x-axis so a history
        # of a different length than the training one does not raise.
        val_epochs = range(1, len(val_loss_history) + 1)
        plt.plot(val_epochs, val_loss_history, label='Validation Loss', marker='o', color='orange')

    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    plt.tight_layout()

    if save_path:
        # savefig does not create directories; make sure the target exists.
        output_dir = os.path.dirname(save_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Name the figure after the checkpoint file (without its extension).
model_file_name = os.path.basename(model_save_path)
model_stem = os.path.splitext(model_file_name)[0]
plot_save_path = f"../graphics/AutoEnconderLastEvent_{model_stem}.png"
plot_loss_history(train_loss_history, val_loss_history, save_path=plot_save_path)

### b. Compute thresholds

In [11]:
# ### THRESHOLD GENERAL
# def calculate_general_threshold(percentile, autoencoder, df_normalized, window_size, batch_size, device):
#     steps_per_epoch = (len(df_normalized) - window_size) // batch_size

#     # Umbral basado en el percentil 95 del error
#     train_gen = sequence_generator_last_event(df_normalized.values, window_size, batch_size)
#     reconstruction_errors = []

#     # Barra de progreso para el cálculo de reconstruction_errors
#     for _ in tqdm(range(steps_per_epoch), desc="Calculando errores de reconstrucción"):
#         batch_inputs, batch_targets = next(train_gen)
#         reconstructed_batch = autoencoder(batch_inputs.to(device)).detach().cpu().numpy()
#         batch_targets = batch_targets.cpu().numpy()
#         reconstruction_errors.extend(
#             np.mean(np.square(batch_targets - reconstructed_batch), axis=(0))
#         )

#     return np.percentile(reconstruction_errors, percentile)

# threshold = calculate_general_threshold(PERCENTILE, autoencoder, train_df_normalized, WINDOW_SIZE, BATCH_SIZE, device)
# threshold

In [12]:
# ### THRESHOLD PERCENTILE
# def calculate_channels_thresholds(percentile, autoencoder, df_normalized, window_size, batch_size, device):
#     steps_per_epoch = (len(df_normalized) - window_size) // batch_size

#     # Umbral basado en el percentil 95 del error
#     train_gen = sequence_generator_last_event(df_normalized.values, window_size, batch_size)
#     reconstruction_errors = []

#     # Barra de progreso para el cálculo de reconstruction_errors
#     for _ in tqdm(range(steps_per_epoch), desc="Calculando errores de reconstrucción"):
#         batch_inputs, batch_targets = next(train_gen)  # Obtener entradas y objetivos
#         batch_inputs = batch_inputs.to(device)

#         # Reconstruir el batch
#         reconstructed_batch = autoencoder(batch_inputs).detach().cpu().numpy()  # Salidas reconstruidas
#         batch_targets = batch_targets.cpu().numpy()

#         errors = np.square(batch_targets - reconstructed_batch)  # Error por canal
#         reconstruction_errors.extend(errors)

#     return list(np.percentile(reconstruction_errors, percentile, axis=0))

# threshold_list = calculate_channels_thresholds(PERCENTILE, autoencoder, train_df_normalized, WINDOW_SIZE, BATCH_SIZE, device)
# threshold_list

In [None]:
### THRESHOLD IPR
def anomaly_limits(errors, percentile, axis) -> list:
    """Compute upper anomaly limits from an inter-percentile range (IPR).

    The limit is ``P_hi + 1.5 * (P_hi - P_lo)``, where ``P_hi`` is the
    ``percentile``-th percentile of ``errors`` and ``P_lo`` the
    ``(100 - percentile)``-th percentile, both taken along ``axis``.

    Args:
        errors: Array-like of error values (an ndarray or a list of rows).
        percentile (float): Upper percentile in [0, 100].
        axis: Axis passed to ``np.percentile``; ``None`` flattens the input.

    Returns:
        list: The limits along the reduced axis. A scalar result (for example
        with ``axis=None``) is returned as a one-element list.
    """
    # Convert once: passing a long list of rows to np.percentile twice would
    # rebuild the array on every call.
    errors = np.asarray(errors)
    lower = np.percentile(errors, 100 - percentile, axis)
    upper = np.percentile(errors, percentile, axis)
    spread = upper - lower
    # atleast_1d keeps list() valid when the reduction yields a 0-d value.
    return list(np.atleast_1d(upper + 1.5 * spread))

def calculate_channels_thresholds(percentile, autoencoder, df_normalized, window_size, batch_size, device):
    """Derive anomaly thresholds from the model's squared reconstruction errors.

    Batches are drawn from ``sequence_generator_last_event`` over
    ``df_normalized``; for each batch the element-wise squared difference
    between the targets and the model output is collected, and the thresholds
    are computed by ``anomaly_limits`` along axis 0 of the stacked errors.

    Args:
        percentile (float): Percentile forwarded to ``anomaly_limits``.
        autoencoder: Trained ``torch.nn.Module`` used for reconstruction.
        df_normalized: DataFrame of normalized data (``.values`` is used).
        window_size (int): Window length passed to the sequence generator.
        batch_size (int): Batch size passed to the sequence generator.
        device: Device the input batches are moved to.

    Returns:
        list: Thresholds as returned by ``anomaly_limits`` — presumably one
        per channel, assuming targets are shaped (batch, channels); confirm
        against the generator.

    Raises:
        ValueError: If the data does not provide at least one full batch.
    """
    steps_per_epoch = (len(df_normalized) - window_size) // batch_size
    if steps_per_epoch <= 0:
        raise ValueError(
            "Not enough rows to build a full batch: "
            f"{len(df_normalized)} rows, window_size={window_size}, batch_size={batch_size}"
        )

    train_gen = sequence_generator_last_event(df_normalized.values, window_size, batch_size)
    batch_errors = []

    # Run inference in eval mode without autograd so the errors do not depend
    # on the mode the model was left in; the previous mode is restored after.
    was_training = autoencoder.training
    autoencoder.eval()
    try:
        with torch.no_grad():
            # Progress bar over the reconstruction-error computation
            for _ in tqdm(range(steps_per_epoch), desc="Calculando errores de reconstrucción"):
                batch_inputs, batch_targets = next(train_gen)
                batch_inputs = batch_inputs.to(device)

                reconstructed_batch = autoencoder(batch_inputs).detach().cpu().numpy()
                batch_targets = batch_targets.cpu().numpy()

                # Keep one array per batch; they are stacked once at the end.
                batch_errors.append(np.square(batch_targets - reconstructed_batch))
    finally:
        autoencoder.train(was_training)

    reconstruction_errors = np.concatenate(batch_errors, axis=0)
    return anomaly_limits(reconstruction_errors, percentile, axis=(0))

# Thresholds are derived from the training split only.
threshold_list = calculate_channels_thresholds(
    percentile=PERCENTILE,
    autoencoder=autoencoder,
    df_normalized=train_df_normalized,
    window_size=WINDOW_SIZE,
    batch_size=BATCH_SIZE,
    device=device,
)
threshold_list

### c. Save model

In [14]:
# torch.save does not create missing directories; make sure the target folder
# exists so a finished training run is not lost at the very last step.
os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)

# The checkpoint bundles the model with the thresholds and the fitted scaler.
# NOTE(review): the whole module object is pickled (not just its state_dict),
# so loading presumably needs the model class to be importable and full
# (non weights-only) unpickling — confirm against the loading code.
torch.save({
    'model': autoencoder,
    'threshold': threshold_list,
    'scaler': scaler,
}, model_save_path)