In [None]:
import os
import glob
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import time
import random

# Function to set seeds for reproducibility
def set_seed(seed):
    """Seed the Python, NumPy and TensorFlow global random generators.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``tf.random.set_seed``, in that order.
    """
    for seed_generator in (random.seed, np.random.seed, tf.random.set_seed):
        seed_generator(seed)

num_synthetic_flights_list = [7, 15, 30, 45, 60, 150]

for num_synthetic_flights in num_synthetic_flights_list:

    # Function to load and process flight data
    def load_and_process_flight_data(file_paths):
        """Read tab-separated flight logs into DataFrames.

        Args:
            file_paths: Iterable of paths to tab-delimited text files.

        Returns:
            A list with one DataFrame per path, in the order given. The
            'Time(milli)' column is removed from any file that has it.
        """
        def read_flight(path):
            frame = pd.read_csv(path, delimiter='\t')
            if 'Time(milli)' not in frame.columns:
                return frame
            return frame.drop(columns=['Time(milli)'])

        return [read_flight(path) for path in file_paths]

    # Function to create state-action pairs for training
    def create_state_action_pairs(flight_data):
        """Split each flight DataFrame into a state array and an action array.

        Args:
            flight_data: Iterable of DataFrames that contain the 16 state
                columns and the 3 action columns named below.

        Returns:
            A ``(states, actions)`` tuple of lists, one entry per flight:
            ``states[i]`` holds the state columns of flight ``i`` and
            ``actions[i]`` its 'JX', 'JY' and 'Throttle' columns, both as
            NumPy arrays with one row per DataFrame row.
        """
        state_columns = [
            'ALT(m)', 'Phi(deg)', 'Theta(deg)', 'Psi(deg)',
            'Vx(m/s)', 'Vy(m/s)', 'Vz(m/s)',
            'P(deg/s)', 'Q(deg/s)', 'R(deg/s)',
            'Nx(m/s2)', 'Ny(m/s2)', 'Nz(m/s2)',
            'Radial(deg)', 'Distance(m)', 'DeltaAlt:Anv-Tgt(m)',
        ]
        action_columns = ['JX', 'JY', 'Throttle']

        # Flights are kept separate (lists of arrays) rather than stacked.
        states = [flight[state_columns].values for flight in flight_data]
        actions = [flight[action_columns].values for flight in flight_data]
        return states, actions

    # Function to plot actual vs predicted actions with mean and standard deviation
    def plot_trajectory_with_mean_std(actual_actions_list, predicted_actions_list, plots_dir):
        """Plot mean ± std of actual vs. predicted actions across flights.

        Each flight is padded to the longest flight's length by repeating its
        last row ('edge' padding), then the per-time-step mean and standard
        deviation over flights are drawn for JX, JY and Throttle in three
        side-by-side subplots. The figure is written to
        ``<plots_dir>/trajectory_comparison_mean_std.png`` and closed.

        Args:
            actual_actions_list: List of 2-D arrays, one per flight, whose
                columns 0, 1, 2 are plotted as JX, JY and Throttle.
            predicted_actions_list: List of 2-D arrays with the predictions
                for the same flights, in the same column layout.
            plots_dir: Directory the PNG is saved into.

        Raises:
            ValueError: If ``actual_actions_list`` is empty (raised by ``max``).
        """
        # The flights may differ in length, so the raw lists must not be passed
        # to np.array directly (ragged input raises ValueError on NumPy >= 1.24);
        # they are padded to a common length first.
        max_time_steps = max(actions.shape[0] for actions in actual_actions_list)
        time_steps = np.arange(max_time_steps)

        def pad_and_stack(sequences):
            # Extend every flight to max_time_steps by repeating its final row.
            return np.array([
                np.pad(seq, ((0, max_time_steps - seq.shape[0]), (0, 0)), 'edge')
                for seq in sequences
            ])

        actual_padded = pad_and_stack(actual_actions_list)
        predicted_padded = pad_and_stack(predicted_actions_list)

        # Statistics over the flight axis, leaving (time step, action channel)
        actual_mean = np.mean(actual_padded, axis=0)
        actual_std = np.std(actual_padded, axis=0)
        predicted_mean = np.mean(predicted_padded, axis=0)
        predicted_std = np.std(predicted_padded, axis=0)

        plt.figure(figsize=(15, 5))

        # One subplot per action channel, all drawn the same way
        for channel, name in enumerate(('JX', 'JY', 'Throttle')):
            plt.subplot(1, 3, channel + 1)
            plt.plot(time_steps, actual_mean[:, channel], label=f'Actual {name} (Mean)', color='blue')
            plt.fill_between(time_steps,
                             actual_mean[:, channel] - actual_std[:, channel],
                             actual_mean[:, channel] + actual_std[:, channel],
                             color='blue', alpha=0.2, label=f'Actual {name} (± Std)')
            plt.plot(time_steps, predicted_mean[:, channel], label=f'Predicted {name} (Mean)', linestyle='--', color='red')
            plt.fill_between(time_steps,
                             predicted_mean[:, channel] - predicted_std[:, channel],
                             predicted_mean[:, channel] + predicted_std[:, channel],
                             color='red', alpha=0.2, label=f'Predicted {name} (± Std)')
            plt.xlabel('Time Step')
            plt.ylabel(name)
            plt.legend(loc='lower center', fontsize='x-small', ncol=2, handlelength=2.5, handletextpad=1.5)

        plt.suptitle('Trajectory Comparison: Actual vs Predicted with Mean and Standard Deviation')
        plt.tight_layout()
        plt.savefig(os.path.join(plots_dir, 'trajectory_comparison_mean_std.png'), format='png', dpi=500)
        plt.close()

    # Directory containing the adjusted flight data files
    adjusted_data_directory = './data/adjusted_flights/'
    adjusted_file_pattern = os.path.join(adjusted_data_directory, 'SimuladorDeVoo_*.txt')
    # glob returns matches in arbitrary, filesystem-dependent order; sort them so
    # the seeded train/test split below selects the same flights on every run
    # and every machine.
    adjusted_files = sorted(glob.glob(adjusted_file_pattern))

    # Load and process the adjusted flight data
    flight_data_adjusted = load_and_process_flight_data(adjusted_files)

    # Split adjusted flights into 70% training/validation and 30% test
    flight_data_train_val_adjusted, flight_data_test = train_test_split(flight_data_adjusted, test_size=0.30, random_state=42)

    # Directory containing the generated flight data files
    generated_data_directory = f'./data/smoothed_flights_{num_synthetic_flights}/'
    generated_file_pattern = os.path.join(generated_data_directory, 'GeneratedFlight_*.txt')
    # Sorted for the same reason as above: the row order of the pooled training
    # data determines which rows each seeded K-fold split receives.
    generated_files = sorted(glob.glob(generated_file_pattern))

    # Load and process the generated flight data
    flight_data_generated = load_and_process_flight_data(generated_files)

    # Combine the 70% adjusted training/validation data with all generated data
    flight_data_train_val = flight_data_train_val_adjusted + flight_data_generated

    print(f'Training/validation set has {len(flight_data_train_val)} samples.')
    print(f'Test set has {len(flight_data_test)} samples from adjusted flights only.')

    # Create state-action pairs for training/validation and testing
    states_train_val_list, actions_train_val_list = create_state_action_pairs(flight_data_train_val)
    states_test_list, actions_test_list = create_state_action_pairs(flight_data_test)

    # Concatenate all flights data for training/validation
    states_train_val = np.concatenate(states_train_val_list, axis=0)
    actions_train_val = np.concatenate(actions_train_val_list, axis=0)

    # Concatenate all flights data for testing
    states_test = np.concatenate(states_test_list, axis=0)
    actions_test = np.concatenate(actions_test_list, axis=0)

    # Normalize the data using training data mean and std
    state_mean = np.mean(states_train_val, axis=0)
    state_std = np.std(states_train_val, axis=0)
    # A feature that is constant over the training data has zero std; divide it
    # by 1 instead so it normalizes to 0 rather than NaN/inf.
    state_std = np.where(state_std == 0, 1.0, state_std)
    states_train_val = (states_train_val - state_mean) / state_std

    # Normalize test data using the same mean and std
    states_test = (states_test - state_mean) / state_std

    # Set random seed for cross-validation
    cv_seed = 42
    set_seed(cv_seed)

    # K-Fold Cross Validation on training/validation data
    # NOTE(review): the split is over individual rows of the pooled flights, so
    # rows of a single flight can fall in both the training and validation
    # folds — confirm this is intended.
    kf = KFold(n_splits=5, shuffle=True, random_state=cv_seed)

    fold_no = 1
    metrics_cv_list = []
    epochs_per_fold = []

    # Directories to save metrics, models, and plots
    metrics_dir = f'./metrics/mlp-full-augmented_{num_synthetic_flights}/'
    plots_dir = f'./plots/mlp-full-augmented_{num_synthetic_flights}/'
    models_dir = f'./models/mlp-full-augmented_{num_synthetic_flights}/'
    os.makedirs(metrics_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    for train_index, val_index in kf.split(states_train_val):
        X_train, X_val = states_train_val[train_index], states_train_val[val_index]
        y_train, y_val = actions_train_val[train_index], actions_train_val[val_index]

        # Define the Feedforward Neural Network model
        model = Sequential([
            Dense(128, input_dim=X_train.shape[1], activation='relu'),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(y_train.shape[1])  # Output layer
        ])

        # Compile the model
        optimizer = Adam(learning_rate=0.00001)
        model.compile(optimizer=optimizer, loss='mse')

        # Define early stopping
        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

        # Measure training time
        train_start_time = time.time()

        # Train the model with validation and early stopping; the huge epoch
        # count is only an upper bound, early stopping ends the run.
        history = model.fit(X_train, y_train, epochs=int(1e6), batch_size=32,
                            validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)

        train_end_time = time.time()
        training_time = train_end_time - train_start_time

        # Record the number of epochs used before early stopping. This counts
        # every epoch that ran, including the `patience` epochs after the best one.
        epochs_per_fold.append(len(history.history['loss']))

        # Measure inference time on validation data (covers evaluate + predict)
        inference_start_time = time.time()

        # Evaluate the model on the validation set
        val_loss = model.evaluate(X_val, y_val, verbose=0)

        # Predict actions on the validation set
        predicted_actions = model.predict(X_val, verbose=0)

        inference_end_time = time.time()
        inference_time = inference_end_time - inference_start_time

        print(f'Fold {fold_no} - Training Time: {training_time:.4f} seconds, Inference Time: {inference_time:.4f} seconds')

        # Calculate MSE, RMSE, MAE, and R² on validation data
        mse = mean_squared_error(y_val, predicted_actions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_val, predicted_actions)
        r2 = r2_score(y_val, predicted_actions)

        # Append metrics and timing to list
        metrics_cv_list.append({
            'fold': fold_no,
            'loss': val_loss,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'training_time': training_time,
            'inference_time': inference_time
        })

        print(f'Fold {fold_no} - Validation MSE: {mse}')
        print(f'Fold {fold_no} - Validation RMSE: {rmse}')
        print(f'Fold {fold_no} - Validation MAE: {mae}')
        print(f'Fold {fold_no} - Validation R²: {r2}')

        # Plot training & validation loss values
        plt.figure(figsize=(10, 5))
        plt.plot(history.history['loss'], label='Train Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'Model Loss - Fold {fold_no}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='upper right')
        plt.savefig(os.path.join(plots_dir, f'loss_fold_{fold_no}.png'))
        plt.close()

        # Save the model using the native Keras format
        model.save(os.path.join(models_dir, f'model_fold_{fold_no}.keras'))

        fold_no += 1

    # Calculate the average number of epochs from cross-validation
    avg_epochs = int(np.mean(epochs_per_fold))
    print(f'Average number of epochs from cross-validation: {avg_epochs}')

    # Convert metrics_cv_list to DataFrame
    metrics_cv_df = pd.DataFrame(metrics_cv_list)

    # Calculate mean and std for each metric. Cast to object up front so the
    # string labels below can replace the numeric 'fold' entry without relying
    # on pandas' deprecated silent upcasting of a float Series.
    mean_metrics_cv = metrics_cv_df.mean(numeric_only=True).astype(object)
    std_metrics_cv = metrics_cv_df.std(numeric_only=True).astype(object)

    # Add the fold column for mean and std
    mean_metrics_cv['fold'] = 'mean'
    std_metrics_cv['fold'] = 'std'

    # Convert mean and std to DataFrame
    mean_metrics_cv_df = pd.DataFrame([mean_metrics_cv])
    std_metrics_cv_df = pd.DataFrame([std_metrics_cv])

    # Append mean and std rows to the metrics_cv_df
    metrics_cv_df = pd.concat([metrics_cv_df, mean_metrics_cv_df, std_metrics_cv_df], ignore_index=True)

    # Save cross-validation metrics DataFrame to CSV
    metrics_cv_df.to_csv(os.path.join(metrics_dir, 'cross_validation_metrics.csv'), index=False)

    # Now, train the final model using different seeds
    final_model_metrics_list = []

    seeds = [43, 44, 45, 46, 47]

    for seed in seeds:
        set_seed(seed)

        # Train the final model using the average number of epochs on the full training/validation data
        final_model = Sequential([
            Dense(128, input_dim=states_train_val.shape[1], activation='relu'),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(actions_train_val.shape[1])  # Output layer
        ])

        # Compile the final model
        final_model.compile(optimizer=Adam(learning_rate=0.00001), loss='mse')

        # Measure training time for the final model
        train_start_time = time.time()

        # Train the final model without early stopping
        final_model.fit(states_train_val, actions_train_val, epochs=avg_epochs, batch_size=32, verbose=0)

        train_end_time = time.time()
        final_training_time = train_end_time - train_start_time

        print(f'Final Model (Seed {seed}) - Training Time: {final_training_time:.4f} seconds')

        # Measure inference time on test data for the final model (covers evaluate + predict)
        inference_start_time = time.time()

        # Evaluate the final model on the test set
        final_loss = final_model.evaluate(states_test, actions_test, verbose=0)

        # Predict actions on the test set using the final model
        final_predicted_actions = final_model.predict(states_test, verbose=0)

        inference_end_time = time.time()
        final_inference_time = inference_end_time - inference_start_time

        print(f'Final Model (Seed {seed}) - Inference Time: {final_inference_time:.4f} seconds')

        # Calculate metrics for the final model on test data
        final_mse = mean_squared_error(actions_test, final_predicted_actions)
        final_rmse = np.sqrt(final_mse)
        final_mae = mean_absolute_error(actions_test, final_predicted_actions)
        final_r2 = r2_score(actions_test, final_predicted_actions)

        # Append final model metrics and timing to list
        final_model_metrics_list.append({
            'seed': seed,
            'loss': final_loss,
            'mse': final_mse,
            'rmse': final_rmse,
            'mae': final_mae,
            'r2': final_r2,
            'training_time': final_training_time,
            'inference_time': final_inference_time
        })

        print(f'Final Model (Seed {seed}) Metrics:')
        print(f'Test MSE: {final_mse}')
        print(f'Test RMSE: {final_rmse}')
        print(f'Test MAE: {final_mae}')
        print(f'Test R²: {final_r2}')

        # Save the final model
        final_model.save(os.path.join(models_dir, f'final_model_seed_{seed}.keras'))

    # After the loop over seeds, process the final model metrics
    final_model_metrics_df = pd.DataFrame(final_model_metrics_list)

    # Calculate mean and std for each metric. Cast to object up front so the
    # string labels below can replace the numeric 'seed' entry without relying
    # on pandas' deprecated silent upcasting of a float Series.
    mean_final_model_metrics = final_model_metrics_df.mean(numeric_only=True).astype(object)
    std_final_model_metrics = final_model_metrics_df.std(numeric_only=True).astype(object)

    # Add the seed column for mean and std
    mean_final_model_metrics['seed'] = 'mean'
    std_final_model_metrics['seed'] = 'std'

    # Convert mean and std to DataFrame
    mean_final_model_metrics_df = pd.DataFrame([mean_final_model_metrics])
    std_final_model_metrics_df = pd.DataFrame([std_final_model_metrics])

    # Append mean and std rows to the final_model_metrics_df
    final_model_metrics_df = pd.concat([final_model_metrics_df, mean_final_model_metrics_df, std_final_model_metrics_df], ignore_index=True)

    # Save final model metrics DataFrame to CSV
    final_model_metrics_df.to_csv(os.path.join(metrics_dir, 'final_model_metrics.csv'), index=False)

    # Trajectory comparison on the held-out flights. `final_model` is whatever
    # the seed loop bound last, i.e. the model trained with the final seed.
    print("Predicting the test dataset using the final model...")

    actual_actions_list = []
    predicted_actions_list = []

    flights_to_predict = zip(states_test_list, actions_test_list)
    for flight_number, (raw_states, flown_actions) in enumerate(flights_to_predict, start=1):
        # The per-flight states are still raw, so apply the training-set
        # normalization before feeding the whole flight to the model.
        normalized_states = (raw_states - state_mean) / state_std
        flight_prediction = final_model.predict(normalized_states, verbose=0)

        actual_actions_list.append(flown_actions)
        predicted_actions_list.append(flight_prediction)
        print(f'Full flight trajectory {flight_number} predicted and collected.')

    # Aggregate all flights into one mean ± std figure
    plot_trajectory_with_mean_std(actual_actions_list, predicted_actions_list, plots_dir)
    print('Trajectory comparison plot with mean and standard deviation saved.')

2024-11-16 12:28:05.761470: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-16 12:28:05.827125: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-16 12:28:05.828445: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Training/validation set has 28 samples.
Test set has 9 samples from adjusted flights only.
Fold 1 - Training Time: 205.3280 seconds, Inference Time: 57.6729 seconds
Fold 1 - Validation MSE: 3.7243133016479475
Fold 1 - Validation RMSE: 1.9298479996227547
Fold 1 - Validation MAE: 0.9496515368296091
Fold 1 - Validation R²: 0.22875272535712746
Fold 2 - Training Time: 296.2161 seconds, Inference Time: 0.1695 seconds
Fold 2 - Validation MSE: 4.7853635329398
Fold 2 - Validation RMSE: 2.1875473784445902
Fold 2 - Validation MAE: 0.9318995651573326
Fold 2 - Validation R²: 0.6373391259515919
Fold 3 - Training Time: 242.7565 seconds, Inference Time: 0.2545 seconds
Fold 3 - Validation MSE: 6.180376346997661
Fold 3 - Validation RMSE: 2.4860362722610585
Fold 3 - Validation MAE: 1.0274360901756796
Fold 3 - Validation R²: 0.571837339394993
Fold 4 - Training Time: 300.6567 seconds, Inference Time: 0.1744 seconds
Fold 4 - Validation MSE: 4.499892794487324
Fold 4 - Validation RMSE: 2.12129507482748
Fold 4