In [None]:
import pandas as pd
from src.GLOBAL_VARS import NotTrustedPlants

model_dir = 'model'

# Read every recorded error row, keep the 24-step forecast runs only, and then
# discard columns that hold no value at all for those runs.
df = pd.read_csv(f'{model_dir}/all_errors.csv')
df = df.loc[df['forecast_horizon'] == 24]
# df = df.drop(columns=['MAPE']) # used if horizon==1
df = df.dropna(axis=1, how='all')

# Report which untrusted plants actually occur in the data before removing them.
untrusted_mask = df['wf_id'].isin(NotTrustedPlants)
wf_id_list = df.loc[untrusted_mask, 'wf_id'].unique()
print(wf_id_list)

df = df.loc[~untrusted_mask]
# Additional wind farms excluded by hand.
df = df.loc[~df['wf_id'].isin([47, 49, 89, 154, 196])]

# Hyperparameter grid under study: a row survives only if every listed column
# takes one of the values given for it.
allowed_values = {
    'time_steps': [1, 2, 4, 8],
    'number_of_cells': [
        "[64, 32, 16]",
        "[128, 64, 32]",
        "[256, 128, 64]",
        "[512, 256, 128]",
    ],
    'batch_size': [32, 64, 128],
    'epochs': [100],
    'learning_rate': [0.001, 0.01],
    'dropout_rate': [0.2],
}
for column_name, choices in allowed_values.items():
    df = df.loc[df[column_name].isin(choices)]

selected_hyperparameters = df
selected_hyperparameters

In [None]:
# For every wind farm keep its 5 best runs, ranked by the lowest `nMAE_overall`.
# NOTE: despite its name, `best_r2` is ranked by nMAE (ascending), not by R^2.
# The name is kept because other cells read `best_r2`.
top_runs_per_farm = []
for wf_id in selected_hyperparameters['wf_id'].unique():
    # Filter the frame produced by the selection cell rather than the global
    # `df`, which another cell re-reads and overwrites with a different filter.
    df_wf_id = selected_hyperparameters[selected_hyperparameters['wf_id'] == wf_id]
    df_wf_id = df_wf_id.sort_values(by=['nMAE_overall'], ascending=True).head(5)
    top_runs_per_farm.append(df_wf_id)

# Concatenate once instead of growing a DataFrame inside the loop; fall back to
# an empty frame when no wind farm survived the filters.
if top_runs_per_farm:
    best_r2 = pd.concat(top_runs_per_farm, ignore_index=True)
else:
    best_r2 = pd.DataFrame()

#best_r2 = best_r2[best_r2['wf_id'] != 154]
best_r2.to_csv('{}/all_errors_bests.csv'.format(model_dir), index=False)
best_r2

In [None]:
import pandas as pd

def find_hyperparameters_by_farm_count(df):
    """
    Rank hyperparameter combinations by how many distinct wind farms use them.

    Rows are grouped by the six hyperparameter columns (time_steps,
    learning_rate, epochs, batch_size, number_of_cells, dropout_rate) and the
    number of unique 'wf_id' values is counted per group.

    The caller is expected to pass a frame that already contains only the
    well-performing runs of each wind farm; no quality filtering happens here.

    Args:
        df (pd.DataFrame): Frame holding the hyperparameter columns and 'wf_id'.

    Returns:
        list: ``(hyperparameter_tuple, farm_count)`` pairs, largest count first.
        An empty list is returned (after printing a message) when the input is
        not a DataFrame, is empty, lacks a required column, or the grouping fails.
    """
    group_keys = [
        'time_steps',
        'learning_rate',
        'epochs',
        'batch_size',
        'number_of_cells',
        'dropout_rate'
    ]

    # Guard clauses: reject unusable input before touching pandas machinery.
    if not isinstance(df, pd.DataFrame):
        print("Error: Input must be a pandas DataFrame.")
        return []

    if df.empty:
        print("Input DataFrame is empty. No analysis can be performed.")
        return []

    absent = [name for name in group_keys + ['wf_id'] if name not in df.columns]
    if absent:
        print(f"Error: DataFrame is missing required columns: {', '.join(absent)}")
        return []

    try:
        # One entry per hyperparameter combination, valued with the number of
        # distinct wind farms that appear with it, most widely shared first.
        ranked = (
            df.groupby(group_keys)['wf_id']
            .nunique()
            .sort_values(ascending=False)
        )
        # Series.items() already yields (index_tuple, count) pairs.
        return list(ranked.items())
    except Exception as e:
        print(f"An error occurred during analysis: {e}")
        return []

# Rank the hyperparameter combinations found among each farm's best runs.
best_r2_df = best_r2

print("--- Analyzing best_r2_df ---")
ranked_hyperparameters = find_hyperparameters_by_farm_count(best_r2_df)

if not ranked_hyperparameters:
    # An empty input frame was already reported by the function itself; only
    # a non-empty frame that yielded nothing needs a hint here.
    if not best_r2_df.empty:
        print("No results to display. Check for earlier error messages if data was provided.")
else:
    print("\nHyperparameter combinations sorted by the number of wind farms they apply to (most farms first):")
    # Display labels, in the same order as the grouping columns of the function.
    param_names = [
        'Time Steps', 'Learning Rate', 'Epochs', 'Batch Size',
        'Number of Cells', 'Dropout Rate'
    ]
    for combo, farm_total in ranked_hyperparameters:
        print(f"\nShared by {farm_total} wind farm(s):")
        for label, setting in zip(param_names, combo):
            print(f"  {label}: {setting}")

# total 15 left

In [None]:
import pandas as pd
import ast

def _literal_items(raw):
    """
    Parse ``raw`` as a Python literal and return its elements as a list.

    A bare string or a non-iterable scalar is wrapped in a one-element list so
    that a single value is never split into characters or rejected.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            text is not a valid literal.
    """
    parsed = ast.literal_eval(str(raw))
    if isinstance(parsed, (str, bytes)):
        return [parsed]
    try:
        return list(parsed)
    except TypeError:
        # Scalars such as 64 are not iterable.
        return [parsed]


def create_file_name(row):
    """
    Creates a standardized file name from a row of the errors DataFrame based on
    the specified project naming convention.

    'number_of_cells' (e.g. '[64, 32, 16]') becomes '64_32_16' and 'variables'
    (e.g. "['u100', 'v100']") becomes 'u100-v100'. A single value that is not
    wrapped in a list (e.g. '64' or "'u100'") is used as-is. Text that is not
    a valid Python literal falls back to plain string clean-up.

    Args:
        row (pd.Series): A row from the errors DataFrame. Any mapping that
            supports ``row[key]`` for the keys used below works.

    Returns:
        str: The generated file name string.
    """
    # --- Handle 'number_of_cells' ---
    try:
        cells_str = "_".join(str(item) for item in _literal_items(row['number_of_cells']))
    except (ValueError, SyntaxError):
        # Fallback for any variations in format
        cells_str = str(row['number_of_cells']).strip("[]").replace(", ", "_").replace(",", "_")

    # --- Handle 'variables' ---
    try:
        vars_str = "-".join(str(item) for item in _literal_items(row['variables']))
    except (ValueError, SyntaxError):
        # Fallback for unexpected formats
        vars_str = str(row['variables']).strip("[]'").replace("', '", "-").replace("'", "")

    # --- Construct the final file name ---
    file_name = (
        f"LSTM_wf{row['wf_id']}"
        f"_start{row['start_date']}"
        f"_end{row['end_date']}"
        f"_filt{row['filter_data']}"
        f"_lag{row['production_lag']}"
        f"_steps{row['time_steps']}"
        f"_bs{row['batch_size']}"
        f"_ep{row['epochs']}"
        f"_do{row['dropout_rate']}"
        f"_lr{row['learning_rate']}"
        f"_cells[{cells_str}]"
        f"_vars[{vars_str}]"
        f"_dense{row['dense']}"
    )

    return file_name

In [None]:
# get the data for each wind farm with the following parameters
#   Time Steps: 4
#   Learning Rate: 0.001
#   Epochs: 100
#   Batch Size: 128
#   Number of Cells: [256, 128, 64]
#   Dropout Rate: 0.2

import pandas as pd
from src.GLOBAL_VARS import NotTrustedPlants

model_dir = 'model'

# Same source table as before: 24-step forecast runs only, with columns that
# are entirely empty for those runs removed.
df = pd.read_csv(f'{model_dir}/all_errors.csv')
df = df.loc[df['forecast_horizon'] == 24]
# df = df.drop(columns=['MAPE']) # used if horizon==1
df = df.dropna(axis=1, how='all')

# Remove untrusted plants plus the wind farms excluded by hand.
is_trusted = ~df['wf_id'].isin(NotTrustedPlants)
is_kept = ~df['wf_id'].isin([45, 47, 49, 89, 154, 196])
df = df.loc[is_trusted & is_kept]

# Keep only the runs trained with the single configuration listed above.
matches_config = (
    (df['time_steps'] == 4)
    & (df['number_of_cells'] == "[256, 128, 64]")
    & (df['batch_size'] == 128)
    & (df['epochs'] == 100)
    & (df['learning_rate'] == 0.001)
    & (df['dropout_rate'] == 0.2)
)
df = df.loc[matches_config]

best_config_models = df
best_config_models


In [None]:
import pandas as pd
import os
import ast
import re
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Load Data and Define Models to Analyze ---
# This cell assumes the 'best_config_models' DataFrame and create_file_name()
# have been created in previous cells.
best_r2_df = best_config_models

# --- 2. Iterate Through Models and Collect Metrics ---
# NOTE: The path is hardcoded and machine-specific; it also rebinds the
# `model_dir` name used by the earlier CSV cells.
model_dir = '/mnt/chromeos/GoogleDrive/MyDrive/Education/University/Non-ESE Courses/Misc./Wind Turbines Project/WFP/src/ThesisProject/LSTM/model'

# Define all possible horizons to initialize the metrics dictionary
metrics_by_horizon = {h: {'nMAE': [], 'nRMSE': [], 'R^2': []} for h in range(1, 25)}

# Loop-invariant helpers: key names used in the error files -> names used here,
# and the pattern of one "<metric>_t+<horizon>: <value>" line.
metric_key_map = {
    'nMAE_capacity': 'nMAE',
    'nRMSE_capacity': 'nRMSE',
    'R^2': 'R^2'
}
metric_line_pattern = re.compile(r"^(nMAE_capacity|nRMSE_capacity|R\^2)_t\+(\d+):\s*([\d\.e\+\-]+|nan)")

# Error files that could not be found; reported after the loop so that a
# silently shrinking sample does not go unnoticed in the averages.
missing_error_files = []

for _, row in best_r2_df.iterrows():
    folder_name = create_file_name(row)
    model_folder_path = os.path.join(model_dir, folder_name)

    # A single model run produces one error file containing all horizons.
    # The filename contains the max horizon, which is in the 'forecast_horizon' column.
    error_file_path = os.path.join(model_folder_path, f"errors_{row['prediction_method']}_{row['forecast_horizon']}.txt")

    if not os.path.exists(error_file_path):
        missing_error_files.append(error_file_path)
        continue

    try:
        with open(error_file_path, 'r') as f:
            lines = f.readlines()

        for line in lines:
            match = metric_line_pattern.search(line)
            if not match:
                continue

            file_metric_key, horizon_str, value_str = match.groups()
            metric_name = metric_key_map[file_metric_key]
            horizon = int(horizon_str)

            # Horizons outside 1..24 are ignored.
            if horizon not in metrics_by_horizon:
                continue

            try:
                value = float(value_str)
            except ValueError:
                # The pattern also accepts strings float() rejects (e.g. "1e+-").
                value = np.nan
            metrics_by_horizon[horizon][metric_name].append(value)
    except Exception as e:
        # Best effort: one unreadable file must not stop the whole analysis.
        print(f"Warning: Could not process file: {error_file_path}. Error: {e}")

if missing_error_files:
    print(f"Warning: {len(missing_error_files)} of {len(best_r2_df)} error files were not found and are excluded from the averages:")
    for missing_path in missing_error_files:
        print(f"  {missing_path}")

# --- 3. Calculate Average Metrics ---
# One list per metric, aligned with `sorted_horizons`; NaN marks a horizon
# for which no valid value was collected.
avg_metrics = {m: [] for m in ['nMAE', 'nRMSE', 'R^2']}
sorted_horizons = sorted(metrics_by_horizon.keys())

for horizon in sorted_horizons:
    for metric in avg_metrics.keys():
        values = [v for v in metrics_by_horizon[horizon][metric] if v is not None and not np.isnan(v)]
        if values:
            avg_metrics[metric].append(np.mean(values))
        else:
            avg_metrics[metric].append(np.nan)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator

# --- 4. Create the Plot (Refined Version) ---
# Plots the per-horizon averages from the previous cell (`sorted_horizons`,
# `avg_metrics`): nMAE and nRMSE on the left axis, R^2 on a twin right axis.

# Local import: this cell needs `os` to create the output folder and should
# not rely on another cell having imported it.
import os

# Create figure with specified size and DPI
fig, ax1 = plt.subplots(figsize=(12, 8), dpi=100)
fig.patch.set_facecolor('white')

# Define professional-looking colors
color_nmae = '#003366'  # Dark blue
color_nrmse = '#3399FF' # Lighter blue
color_r2 = '#CC3300'    # Deep red/orange

# Plot nMAE and nRMSE on the primary y-axis (ax1)
ax1.set_xlabel('Forecast Horizon (hours)', fontsize=14)
ax1.set_ylabel('Normalized Error (nMAE, nRMSE)', color=color_nmae, fontsize=14)
line1 = ax1.plot(sorted_horizons, avg_metrics['nMAE'], marker='o', linestyle='-', color=color_nmae, label='Average nMAE')
line2 = ax1.plot(sorted_horizons, avg_metrics['nRMSE'], marker='s', linestyle='--', color=color_nrmse, label='Average nRMSE')
ax1.tick_params(axis='y', labelcolor=color_nmae)
ax1.set_facecolor('none')

# Create a second y-axis for R^2 (ax2)
ax2 = ax1.twinx()
ax2.set_ylabel('R² Score', color=color_r2, fontsize=14)
line3 = ax2.plot(sorted_horizons, avg_metrics['R^2'], marker='^', linestyle=':', color=color_r2, label='Average R²')
ax2.tick_params(axis='y', labelcolor=color_r2)
ax2.set_facecolor('none')

# Ask both y-axes for the same number of "round"-valued ticks.
# MaxNLocator picks the values per axis, so the two tick sets are similar in
# count but not guaranteed to line up exactly.
ax1.yaxis.set_major_locator(MaxNLocator(nbins=6, prune='both'))
ax2.yaxis.set_major_locator(MaxNLocator(nbins=6, prune='both'))

# Add a light, non-intrusive grid (drawn from the left axis ticks)
ax1.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray', alpha=0.6)

# Hide top spines for a cleaner look
ax1.spines['top'].set_visible(False)
ax2.spines['top'].set_visible(False)

# Set title (applies to the current axes, i.e. the twin axis created last)
plt.title('Average Model Performance vs. Forecast Horizon', fontsize=16, y=1.08)

# --- Combine legends of both axes and place them above the plot ---
lines = line1 + line2 + line3
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels,
           loc='lower center',             # Anchor point of the legend box
           bbox_to_anchor=(0.5, 1.02),      # Position the anchor point
           ncol=3,                         # Arrange legend items horizontally
           frameon=True,
           fontsize=12)

# Set the ticks before tight_layout so the layout accounts for the final labels.
plt.xticks(sorted_horizons)
fig.tight_layout()

# Create the output folder if needed; savefig does not create directories.
thesis_plot_dir = 'plots_for_thesis'
os.makedirs(thesis_plot_dir, exist_ok=True)
plt.savefig(os.path.join(thesis_plot_dir, 'Performance_vs_Horizon.png'), dpi=100)
plt.show()

# Reset matplotlib settings to default if you have other plots in the notebook
plt.rcdefaults()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Restructure Data for Box Plotting ---
# Flatten the already-populated 'metrics_by_horizon' dictionary into long form
# (one record per horizon / metric / value), the layout Seaborn expects.
# Horizons without any nMAE entry are skipped entirely; NaN values are dropped.
sorted_horizons = sorted(metrics_by_horizon.keys())

plot_data = [
    {'Horizon': horizon_key, 'Metric': metric_label, 'Value': sample}
    for horizon_key in sorted_horizons
    if metrics_by_horizon[horizon_key]['nMAE']
    for metric_label, samples in metrics_by_horizon[horizon_key].items()
    for sample in samples
    if sample is not None and not np.isnan(sample)
]

plot_df = pd.DataFrame(plot_data)


# --- 2. Create the Box Plots with Averages ---
plt.style.use('default')
fig, axes = plt.subplots(2, 1, figsize=(14, 12), sharex=True, dpi=100)
fig.patch.set_facecolor('white')
ax_top, ax_bottom = axes[0], axes[1]

# Marker used to show the mean inside every box
mean_props = dict(
    marker="o",
    markerfacecolor="white",
    markeredgecolor="black",
    markersize="8",
)

# Data subsets for the two panels
error_rows = plot_df[plot_df['Metric'].isin(['nMAE', 'nRMSE'])]
r2_rows = plot_df[plot_df['Metric'] == 'R^2']

# --- Top Subplot: nMAE and nRMSE Distributions ---
sns.boxplot(
    x='Horizon', y='Value', hue='Metric',
    data=error_rows,
    ax=ax_top,
    palette={'nMAE': '#003366', 'nRMSE': '#3399FF'},
    showmeans=True,
    meanprops=mean_props,
)
ax_top.set_title('Distribution of Normalized Errors vs. Forecast Horizon', fontsize=16)
ax_top.set_ylabel('Normalized Error Value', fontsize=12)
ax_top.set_xlabel('')
ax_top.legend(title='Metric')
ax_top.grid(True, which='major', axis='y', linestyle='--', alpha=0.6)

# --- Bottom Subplot: R^2 Distribution ---
sns.boxplot(
    x='Horizon', y='Value',
    data=r2_rows,
    ax=ax_bottom,
    color='#CC3300',
    showmeans=True,
    meanprops=mean_props,
)
ax_bottom.set_title('Distribution of R² Score vs. Forecast Horizon', fontsize=16)
ax_bottom.set_ylabel('R² Score', fontsize=12)
ax_bottom.set_xlabel('Forecast Horizon (hours)', fontsize=14)
ax_bottom.grid(True, which='major', axis='y', linestyle='--', alpha=0.6)

# Final adjustments: leave a small margin at the top, then save and show
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.savefig('plots_for_thesis/Box_Plot_Horizons.png', dpi=100)
plt.show()

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import re
import io

# --- Helper Function to Load Data ---
def load_3d_from_txt(file_path):
    """
    Loads a 3D NumPy array that was saved slice-by-slice into a text file.

    The file is expected to contain a header comment of the form
    ``# Array shape: (a, b, c)`` and the 2D slices as whitespace-separated
    numbers, separated from each other by ``# New slice`` lines.

    Every slice is read as a 2D array, so a slice with a single row or a
    single value keeps its orientation. Blocks that contain only comments or
    blank lines are ignored. A slice that cannot be parsed is skipped with a
    printed message (best effort).

    Args:
        file_path (str): The path to the text file.

    Returns:
        np.ndarray: The reconstructed 3D NumPy array.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the shape header is missing, no slice could be loaded,
            the slices have inconsistent shapes, or the loaded data cannot be
            reshaped to the shape given in the header.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r') as f:
        content = f.read()

    # Extract the original shape from the header comment
    shape_match = re.search(r'# Array shape: \((.*?)\)', content)
    if not shape_match:
        raise ValueError("Could not find array shape in file header.")
    # Skip empty tokens so a trailing comma, e.g. "(5,)", is tolerated.
    shape = tuple(int(token) for token in shape_match.group(1).split(',') if token.strip())

    loaded_slices = []
    for block in content.split('# New slice\n'):
        # Keep only lines that carry data; the header and any other comment
        # line are dropped here regardless of their exact formatting.
        data_lines = [
            text_line for text_line in block.splitlines()
            if text_line.strip() and not text_line.lstrip().startswith('#')
        ]
        if not data_lines:
            continue
        try:
            # ndmin=2 keeps single-row and single-value slices two-dimensional.
            slice_arr = np.loadtxt(io.StringIO("\n".join(data_lines)), ndmin=2)
        except Exception as e:
            print(f"Skipping a malformed slice. Error: {e}")
            continue
        loaded_slices.append(slice_arr)

    # Stack the 2D slices to form a 3D array
    if not loaded_slices:
        raise ValueError("No data slices were loaded from the file.")

    array_3d = np.stack(loaded_slices, axis=0)

    # Verify final shape
    if array_3d.shape != shape:
        print(f"Warning: Reconstructed shape {array_3d.shape} does not match header shape {shape}. Reshaping...")
        array_3d = array_3d.reshape(shape)

    return array_3d

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import re

# --- Configuration for the Plot ---
# Folder that holds one sub-folder per investigated model run.
model_investigate_dir = '../model_investigate'

### AR vs SS models
# folder_name = 'LSTM_wf9_start2020-01-01_end2023-12-31_filtFalse_lag1_steps16_bs32_ep100_do0.4_lr0.01_cells[128_64_32]_vars[u100-v100-air_density]_dense1'
# folder_name = 'LSTM_wf9_start2020-01-01_end2023-12-31_filtFalse_lag1_steps16_bs32_ep500_do0.4_lr0.01_cells[512_128_64]_vars[u100-v100-air_density]_dense24'

### Best model's plots
folder_name = 'LSTM_wf58_start2020-01-01_end2023-12-31_filtFalse_lag1_steps4_bs64_ep100_do0.2_lr0.001_cells[256_128_64]_vars[u100-v100-air_density]_dense1'


# --- Automatic Discovery of Files and Parameters ---
# Sets `data_loaded` to True only when y_true, y_pred and X_test were all read;
# the plotting cells check this flag before using them.
try:
    investigation_path = os.path.join(model_investigate_dir, folder_name)
    pred_pattern = re.compile(r"y_pred_(single_shot|autoregressive|autoregressive_attention)_(\d+)\.txt")

    y_pred_filename, prediction_method_raw, prediction_method, forecast_horizon = None, None, None, None

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # reproducible when the folder holds more than one y_pred_* file.
    for filename in sorted(os.listdir(investigation_path)):
        match = pred_pattern.match(filename)
        if match:
            y_pred_filename = filename
            prediction_method_raw = match.group(1)
            # Human-readable form for titles, e.g. 'single_shot' -> 'Single Shot'
            prediction_method = prediction_method_raw.replace('_', ' ').title()
            forecast_horizon = int(match.group(2))
            # Define all corresponding filenames
            y_true_filename = f"y_true_{prediction_method_raw}_{forecast_horizon}.txt"
            x_test_filename = f"x_test_{prediction_method_raw}_{forecast_horizon}.txt"
            break

    if not y_pred_filename:
        raise FileNotFoundError(f"Could not find a y_pred_* file in '{investigation_path}'")

    # The number of input time steps is encoded in the folder name as '_steps<N>_'.
    steps_match = re.search(r'_steps(\d+)_', folder_name)
    if not steps_match:
        raise ValueError(f"Could not extract time_steps from folder name: {folder_name}")
    time_steps = int(steps_match.group(1))

    print(f"Discovered files for '{prediction_method}' method.")
    print(f"Forecast Horizon: {forecast_horizon}, Input Time Steps: {time_steps}")

    # --- Load the Data (including X_test) ---
    y_true_path = os.path.join(investigation_path, y_true_filename)
    y_pred_path = os.path.join(investigation_path, y_pred_filename)
    x_test_path = os.path.join(investigation_path, x_test_filename)

    # Assumes load_3d_from_txt is defined in a previous cell
    y_true = load_3d_from_txt(y_true_path)
    y_pred = load_3d_from_txt(y_pred_path)
    X_test = load_3d_from_txt(x_test_path)

    print(f"Successfully loaded y_true with shape: {y_true.shape}")
    print(f"Successfully loaded y_pred with shape: {y_pred.shape}")
    print(f"Successfully loaded X_test with shape: {X_test.shape}")
    data_loaded = True

except (FileNotFoundError, ValueError, NameError) as e:
    # Report the problem and let the later cells skip their work.
    print(f"An error occurred: {e}")
    print("Please ensure 'folder_name' is correct and 'load_3d_from_txt' is defined.")
    data_loaded = False

In [None]:

# --- Plotting the Time Series Comparison ---
# Draws actual vs. predicted production over all loaded samples and saves the
# figures under 'plots_for_thesis'. Uses variables set by the data-discovery
# cell: data_loaded, y_true, y_pred, time_steps, forecast_horizon,
# prediction_method and prediction_method_raw.
if data_loaded:
    num_samples = y_true.shape[0]
    # x positions are offset by the number of input time steps, so the axis
    # starts at `time_steps` instead of 0.
    time_index = np.arange(num_samples) + time_steps
    color_actual = '#003366'
    color_pred = '#CC3300'

    if forecast_horizon == 1:
        # Single-step forecast: one figure, shown inline and saved.
        fig, ax = plt.subplots(figsize=(14, 7), dpi=100)
        fig.patch.set_facecolor('white')
        ax.plot(time_index, y_true[:, 0, 0], label='Actual', color=color_actual)
        ax.plot(time_index, y_pred[:, 0, 0], label='Predicted', color=color_pred, linewidth=0.8, alpha=0.9)
        # NOTE(review): the label text ends with an unmatched ')' — confirm the intended wording.
        ax.set_xlabel('Complete Sample Data Length)', fontsize=12)
        ax.set_ylabel('Production', fontsize=12)
        ax.set_title(f'Time Series: Actual vs. Predicted ({prediction_method})', fontsize=16)
        ax.legend(frameon=True, edgecolor='black')
        ax.grid(True, linestyle='--', alpha=0.6)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.tight_layout()
        output_path = os.path.join('plots_for_thesis', f'time_series_{prediction_method_raw}_{forecast_horizon}.png')
        plt.savefig(output_path)
        print(f"Plot saved to: {output_path}")
        # NOTE(review): unlike the multi-step branch, this figure is shown but not closed.
        plt.show()

    else: # Multi-step forecast: one 2x2 figure per group of 4 horizons
        group_size = 4
        for group_index, start_h in enumerate(range(0, forecast_horizon, group_size)):
            fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharex=True, dpi=100, facecolor='white')
            fig.suptitle(f'Time Series: Actual vs. Predicted ({prediction_method})', fontsize=20)
            # Flatten the 2x2 grid so subplots can be addressed with one index.
            axes = axes.ravel()

            # Last horizon index (exclusive) of this group; the final group may be shorter.
            end_h = min(start_h + group_size, forecast_horizon)

            for i, h in enumerate(range(start_h, end_h)):
                ax = axes[i]
                ax.plot(time_index, y_true[:, h, 0], label='Actual', color=color_actual)
                ax.plot(time_index, y_pred[:, h, 0], label='Predicted', color=color_pred, linewidth=0.7, alpha=0.9)
                # h is 0-based; titles use the 1-based step t+1, t+2, ...
                ax.set_title(f'Forecast Horizon: t+{h + 1}', fontsize=14)
                ax.legend(frameon=True, edgecolor='black', fontsize=10)
                ax.grid(True, linestyle='--', alpha=0.6)
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)
                # Add y-labels to the left-side plots
                if i % 2 == 0:
                    ax.set_ylabel('Production')

            # Hide any unused subplots in the last group
            for i in range(end_h - start_h, group_size):
                fig.delaxes(axes[i])

            # Shared x-axis caption for the whole figure, placed at the bottom centre.
            fig.text(0.5, 0.01, 'Time Steps (from start of test period)', ha='center', va='center', fontsize=14)
            # Leave room for the caption (bottom) and the suptitle (top).
            plt.tight_layout(rect=[0, 0.03, 1, 0.96])

            # Generate filename with letter suffix: 'a' for the first group, 'b' for the second, ...
            # NOTE(review): more than 26 groups would run past 'z'.
            suffix = chr(ord('a') + group_index)
            output_filename = f'time_series_{prediction_method_raw}_{forecast_horizon}_{suffix}.png'
            output_path = os.path.join('plots_for_thesis', output_filename)
            plt.savefig(output_path)
            print(f"Plot saved to: {output_path}")

            #plt.show() # Show the plot for the current group
            plt.close(fig) # Close the figure to free up memory

In [None]:
# GIF for the presentation!!!
import os
import numpy as np
import matplotlib.pyplot as plt
import imageio


# --- Prerequisites ---
# This cell assumes the following variables were defined by the data-discovery
# cell before it is run:
#
# y_true: A numpy array with the ground truth time series data.
#         Indexed below as y_true[:, h, 0] for each forecast step h.
#
# y_pred: A numpy array with the predicted time series data, indexed the same way.
#
# time_steps: An integer offset added to the sample index on the x-axis.
#
# forecast_horizon: An integer for the number of steps to forecast.
#
# prediction_method: A string for the plot title.
#
# prediction_method_raw: A string for the filename.
#
# data_loaded: A boolean flag, set to True if data is ready.


# --- Plotting and GIF Generation ---
if 'data_loaded' in locals() and data_loaded:
    num_samples = y_true.shape[0]
    time_index = np.arange(num_samples) + time_steps
    color_actual = '#003366'  # Dark Blue
    color_pred = '#CC3300'   # Dark Red

    # --- Directory Setup ---
    # Create directories to store the plots and the final GIF
    plots_dir = 'plots_for_gif'
    output_dir = 'plots_for_thesis'
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    image_files = []

    # Consistent Y-axis limits across all frames for better comparison.
    # They depend on the whole arrays only, so they are computed once.
    y_min = min(y_true.min(), y_pred.min())
    y_max = max(y_true.max(), y_pred.max())
    y_limits = (y_min - 0.1 * abs(y_min), y_max + 0.1 * abs(y_max))

    # --- Generate Individual Plots for each Forecast Horizon ---
    print("Generating individual plots for each forecast horizon...")
    for h in range(forecast_horizon):
        fig, ax = plt.subplots(figsize=(14, 7), dpi=120) # Increased DPI for better quality
        fig.patch.set_facecolor('white')

        # Plot actual vs. predicted data
        ax.plot(time_index, y_true[:, h, 0], label='Actual', color=color_actual, linewidth=2)
        ax.plot(time_index, y_pred[:, h, 0], label='Predicted', color=color_pred, linewidth=1.5, alpha=0.9)

        # --- Aesthetics and Labels ---
        ax.set_xlabel('Time Steps', fontsize=14)
        ax.set_ylabel('Production', fontsize=14)
        ax.set_title(f'Time Series: Actual vs. Predicted ({prediction_method})\nForecast Horizon: t+{h + 1}', fontsize=18)
        ax.legend(frameon=True, edgecolor='black', fontsize=12)
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        # Improve spine visibility and ticks
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_color('grey')
        ax.spines['bottom'].set_color('grey')
        ax.tick_params(axis='x', colors='grey', labelsize=12)
        ax.tick_params(axis='y', colors='grey', labelsize=12)

        ax.set_ylim(*y_limits)

        plt.tight_layout()

        # Save the figure to a file
        filename = f"{plots_dir}/frame_{h:03d}.png"
        plt.savefig(filename)
        image_files.append(filename)
        plt.close(fig) # Close the figure to free up memory

    print(f"Successfully generated {len(image_files)} plot images in '{plots_dir}/'")


    # --- Create GIF from the saved plots ---
    gif_path = os.path.join(output_dir, f'time_series_{prediction_method_raw}_{forecast_horizon}h_forecast.gif')
    print(f"Creating GIF... Saving to {gif_path}")

    # NOTE(review): the unit of `duration` differs between imageio versions — confirm 500 means ms here.
    with imageio.get_writer(gif_path, mode='I', duration=500, loop=0) as writer: # duration in ms
        for filename in image_files:
            image = imageio.imread(filename)
            writer.append_data(image)

    print("GIF creation complete.")

    # --- Clean up the individual plot images ---
    print(f"Cleaning up temporary image files from '{plots_dir}/'...")
    for filename in image_files:
        os.remove(filename)
    try:
        os.rmdir(plots_dir)
    except OSError as e:
        # The folder may hold files this cell did not create; leave it in
        # place instead of failing after the GIF was written successfully.
        print(f"Note: could not remove '{plots_dir}/' (it may contain other files): {e}")
    print("Cleanup complete.")

else:
    print("Data not loaded (the 'data_loaded' variable was not found or was False). Skipping plotting.")



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.metrics import r2_score
from matplotlib.ticker import MaxNLocator
from src.DatabaseGetInfo import DatabaseAnalyzer

# --- Cell 3: Plotting Metrics vs. Forecast Horizon ---
# Computes R^2, nMAE and nRMSE per forecast step for the single model loaded by
# the data-discovery cell and saves a dual-axis plot under 'plots_for_thesis'.
# Uses: data_loaded, folder_name, y_true, y_pred, forecast_horizon, prediction_method.
# NOTE(review): `os` is used below but is not imported in this cell; it relies
# on an earlier cell having imported it.

if data_loaded:
    # --- Get Installed Capacity using DatabaseAnalyzer ---
    # Stays None when the lookup fails or returns no data; the metric
    # normalization below then falls back to the mean of y_true.
    installed_capacity = None
    try:
        # Extract wf_id from the folder name to use in the query
        wf_id_match = re.search(r'_wf(\d+)_', folder_name)
        if not wf_id_match:
            raise ValueError("Could not extract wf_id from folder name.")
        wf_id = int(wf_id_match.group(1))

        # Use the project's DatabaseAnalyzer class
        # NOTE(review): the database path is absolute and machine-specific.
        analyzer = DatabaseAnalyzer.WindFarmAnalyzer(db_path='/home/wheatley/WFD/wfd.db')
        moe_data = analyzer.get_moe_data(wf_id)

        # Installed capacity = sum of the 'additional_unit_power_electrical' column
        # (presumably MW, as the printed message says — confirm against the database).
        if not moe_data.empty and 'additional_unit_power_electrical' in moe_data.columns:
            installed_capacity = float(moe_data['additional_unit_power_electrical'].sum())
            print(f"Found installed capacity for wf_id {wf_id}: {installed_capacity} MW")
        else:
            print(f"Warning: No installed capacity data found via DatabaseAnalyzer for wf_id {wf_id}.")

    except Exception as e:
        # Any failure here is reported and the cell continues with the fallback normalization.
        print(f"An error occurred during database lookup: {e}")

    # --- Calculate Metrics ---
    r2_values, nrmse_values, nmae_values = [], [], []

    # Errors are divided by the installed capacity when it is known and positive,
    # otherwise by the mean of y_true (or 1 if that mean is exactly 0).
    if installed_capacity and installed_capacity > 0:
        normalization_factor = installed_capacity
        print(f"Normalizing metrics using installed capacity: {normalization_factor} MW")
    else:
        normalization_factor = np.mean(y_true)
        if normalization_factor == 0: normalization_factor = 1
        print(f"Warning: Could not use installed capacity. Falling back to normalizing by the mean of y_true: {normalization_factor:.2f}")

    # One value of each metric per forecast step (h is 0-based).
    for h in range(forecast_horizon):
        y_true_h, y_pred_h = y_true[:, h, 0], y_pred[:, h, 0]
        r2 = r2_score(y_true_h, y_pred_h)
        mae = np.mean(np.abs(y_true_h - y_pred_h))
        rmse = np.sqrt(np.mean((y_true_h - y_pred_h)**2))

        r2_values.append(r2)
        nmae_values.append(mae / normalization_factor)
        nrmse_values.append(rmse / normalization_factor)

    # --- Create the Plot ---
    # x-axis uses 1-based horizons: 1 .. forecast_horizon
    horizons = np.arange(1, forecast_horizon + 1)
    fig, ax1 = plt.subplots(figsize=(12, 8), dpi=100)
    fig.patch.set_facecolor('white')

    # NOTE(review): the nMAE/nRMSE colors here are swapped relative to the
    # averaged-metrics plot earlier in the notebook — confirm this is intended.
    color_nmae, color_nrmse, color_r2 = '#3399FF', '#003366', '#CC3300'

    ax1.set_xlabel('Forecast Horizon (hours)', fontsize=14)
    ax1.set_ylabel('Normalized Error (nRMSE, nMAE)', color=color_nrmse, fontsize=14)
    # The labelled plot calls below provide the handles and labels collected for the legend
    line1 = ax1.plot(horizons, nrmse_values, marker='s', linestyle='-', color=color_nrmse, label='nRMSE')
    line2 = ax1.plot(horizons, nmae_values, marker='^', linestyle='--', color=color_nmae, label='nMAE')
    ax1.tick_params(axis='y', labelcolor=color_nrmse)

    # Second y-axis (sharing the x-axis) for the R^2 curve
    ax2 = ax1.twinx()
    ax2.set_ylabel('R² Score', color=color_r2, fontsize=14)
    line3 = ax2.plot(horizons, r2_values, marker='o', linestyle=':', color=color_r2, label='R²')
    ax2.tick_params(axis='y', labelcolor=color_r2)

    # Same tick-count settings on both y-axes; the grid follows the left axis.
    ax1.yaxis.set_major_locator(MaxNLocator(nbins=6, prune='both'))
    ax2.yaxis.set_major_locator(MaxNLocator(nbins=6, prune='both'))
    ax1.grid(True, which='major', linestyle='--', linewidth=0.5, color='gray', alpha=0.6)

    ax1.spines['top'].set_visible(False)
    ax2.spines['top'].set_visible(False)

    # y=1.08 lifts the title above the legend, which is anchored just above the axes.
    plt.title(f'Model Metrics vs. Forecast Horizon ({prediction_method})', fontsize=16, y=1.08)

    # --- Combined legend ---
    # Get handles and labels from each axis individually
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()

    # Combine them into single lists
    all_handles = h1 + h2
    all_labels = l1 + l2

    # Create the legend using the combined lists
    ax1.legend(all_handles, all_labels, loc='lower center', bbox_to_anchor=(0.5, 1.02), ncol=3, frameon=True, edgecolor='black', fontsize=12)

    fig.tight_layout()
    plt.xticks(horizons)

    # Rebuild the file-name form of the method from the display string,
    # e.g. 'Single Shot' -> 'single_shot'.
    method_str_for_filename = prediction_method.replace(' ', '_').lower()
    output_path = os.path.join('plots_for_thesis', f'metrics_vs_horizon_{method_str_for_filename}_{forecast_horizon}.png')
    plt.savefig(output_path)
    print(f"Metrics plot saved to: {output_path}")

    #plt.show()
    plt.close(fig)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

# This cell uses the variables loaded in the data discovery cell
# (data_loaded, y_true, y_pred, forecast_horizon, time_steps,
# prediction_method, prediction_method_raw).
#
# For each selected starting sample it saves one 2x2 figure. Every panel shows
# a single forecast (actual vs. predicted over the horizon, preceded by the
# last known actual value), and consecutive panels are offset from each other
# by `shift_amount` samples.
if data_loaded:
    # --- Configuration ---
    num_figures_to_generate = 32
    plots_per_figure = 4  # must match the 2x2 subplot grid created below

    # `shift_amount` is the offset between consecutive panels of one figure;
    # `fixed_interval` is the offset between the first panels of consecutive
    # figures. A fixed interval (rather than linspace) keeps the selected time
    # windows identical for the AR and SS models.
    if forecast_horizon > 3:
        shift_amount = int(forecast_horizon // 4)
        fixed_interval = shift_amount * 3
    else:
        shift_amount = 1
        fixed_interval = 4
    print(f"Using a shift of {shift_amount} hours between subplots.")

    output_dir = os.path.join('plots_for_thesis', 'shifted_forecast_plots')
    os.makedirs(output_dir, exist_ok=True)
    print(f"Saving up to {num_figures_to_generate} plot groups to: {output_dir}")

    # Candidate start indices begin at 1 (not 0) because every panel reads the
    # sample *before* its forecast to obtain the last known value.
    potential_start_indices = np.arange(1, num_figures_to_generate * fixed_interval, fixed_interval)

    # Keep only the candidates whose last panel still falls inside the data.
    total_samples = y_true.shape[0]
    required_span_for_group = (plots_per_figure - 1) * shift_amount
    start_indices = [
        idx for idx in potential_start_indices
        if idx + required_span_for_group < total_samples
    ]

    if not start_indices:
        print("Warning: Not enough data points to generate any forecast groups with the fixed intervals.")
    else:
        color_actual, color_pred = '#003366', '#CC3300'

        # Loop-invariant values: computed once here instead of once per panel.
        horizon_axis_extended = np.arange(0, forecast_horizon + 1)
        y_axis_upper = np.max(y_true) * 1.05

        for fig_index, start_idx in enumerate(start_indices):
            fig, axes = plt.subplots(2, 2, figsize=(18, 12), sharex=True, sharey=True, dpi=100)
            fig.patch.set_facecolor('white')
            fig.suptitle(f'Forecast Examples for {prediction_method}, Starting at t={start_idx + time_steps} (from sample size)', fontsize=22, y=0.99)
            axes = axes.ravel()

            for i in range(plots_per_figure):
                ax = axes[i]
                current_pred_index = start_idx + (i * shift_amount)

                # Get t+0 value from the previous y_true sample and prepend it
                # to the actual series so the curve starts at the last known point.
                last_known_true_value = y_true[current_pred_index - 1, 0, 0]
                y_true_extended = np.concatenate(
                    ([last_known_true_value], y_true[current_pred_index, :, 0])
                )

                # There is no prediction at t+0, so pad with NaN to keep both
                # series aligned on the same x-axis.
                y_pred_with_nan = np.concatenate(
                    ([np.nan], y_pred[current_pred_index, :, 0])
                )

                # Plot the extended data
                ax.plot(horizon_axis_extended, y_true_extended, label='Actual', color=color_actual, marker='^', markersize=5)
                ax.plot(horizon_axis_extended, y_pred_with_nan, label='Predicted', color=color_pred, marker='o', markersize=5, linewidth=1.2, alpha=0.9)

                ax.set_title(f'Forecast Made at t={current_pred_index + time_steps}', fontsize=14)
                ax.grid(True, linestyle='--', alpha=0.6)
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)
                ax.set_ylim(0, y_axis_upper)
                ax.set_xlim(0, forecast_horizon + 1)

            # Every panel carries the same two legend entries, so a single
            # figure-level legend is built from the last panel's handles.
            handles, labels = axes[plots_per_figure - 1].get_legend_handles_labels()
            fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, 0.92), ncol=2, fontsize=14, frameon=True, edgecolor='black')
            fig.text(0.5, 0.02, 'Time Steps from Last Known Value (t+h)', ha='center', va='center', fontsize=16)
            fig.text(0.01, 0.5, 'Production', ha='center', va='center', rotation='vertical', fontsize=16)
            # Leave room for the shared axis labels (left/bottom) and the
            # title + legend (top).
            fig.tight_layout(rect=[0.03, 0.05, 1, 0.92])

            output_filename = f'shifted_{prediction_method_raw}_group_{fig_index + 1}_start_t{start_idx + time_steps}.png'
            output_path = os.path.join(output_dir, output_filename)
            fig.savefig(output_path)
            plt.close(fig)  # release the figure; many are created in this loop

        print(f"\nFinished generating and saving {len(start_indices)} plot groups.")

In [None]:
# GIF FOR PRESENTATION
import os
import numpy as np
import matplotlib.pyplot as plt
import imageio

# --- Prerequisites ---
# This script assumes the following variables are already loaded and defined
# in your environment before this script is run:
#
# y_true: A numpy array with the ground truth time series data.
#         Shape: (num_samples, forecast_horizon, 1)
#         y_true[i, h, 0] is the actual value at time (i + h + time_steps).
#
# y_pred: A numpy array with the predicted time series data.
#         Shape: (num_samples, forecast_horizon, 1)
#         y_pred[i, h, 0] is the prediction for time (i + h + time_steps).
#
# time_steps: An integer representing the number of initial time steps
#             not included in the test data.
#
# forecast_horizon: An integer for the number of steps to forecast.
#
# prediction_method: A string for the plot title (e.g., "LSTM Model").
#
# prediction_method_raw: A string for the filename (e.g., "LSTM_Model").
#
# data_loaded: A boolean flag, set to True if data is ready.


# --- Plotting and GIF Generation ---
# Renders one frame per forecast origin (the h=0 actuals up to that origin,
# the forecast made there, and the actual values for the same horizon),
# assembles the frames into an animated GIF under `plots_for_thesis/`, and
# finally removes the temporary frame images.
if 'data_loaded' in locals() and data_loaded:
    # --- Ensure required variables exist ---
    # The lookup uses globals(): inside a comprehension or generator, locals()
    # can refer to the comprehension's own scope rather than the notebook
    # namespace, which makes every variable look undefined.
    required_vars = ['y_true', 'y_pred', 'time_steps', 'forecast_horizon', 'prediction_method', 'prediction_method_raw']
    missing_vars = [var for var in required_vars if var not in globals()]
    if missing_vars:
        # Fail before any frame is rendered instead of part-way through.
        raise NameError(
            f"Required variable(s) not defined: {', '.join(missing_vars)}. Please load your data first."
        )

    num_samples = y_true.shape[0]

    # A frame is rendered for sample i only while i + forecast_horizon <= num_samples,
    # so the animation stops forecast_horizon - 1 samples before the end of the data.
    num_frames = max(0, min(num_samples, num_samples - forecast_horizon + 1))

    if num_frames == 0:
        print(f"Not enough samples ({num_samples}) for a forecast horizon of {forecast_horizon}. Skipping GIF generation.")
    else:
        # This is the x-axis for the start of each forecast
        time_index = np.arange(num_samples) + time_steps

        color_history = '#003366'  # Dark Blue
        color_forecast = '#CC3300'   # Dark Red
        color_future_true = '#006633' # Dark Green

        # --- Directory Setup ---
        plots_dir = 'plots_for_rolling_gif'   # temporary frames, removed at the end
        output_dir = 'plots_for_thesis'       # final GIF
        os.makedirs(plots_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        image_files = []

        # --- Set consistent plot limits for a smooth animation ---
        y_min = min(y_true.min(), y_pred.min())
        y_max = max(y_true.max(), y_pred.max())
        y_padding = 0.1 * (y_max - y_min)
        plot_ylim = (y_min - y_padding, y_max + y_padding)

        # X-axis will go from the start of the data to the end of the last possible forecast
        plot_xlim = (time_index[0], time_index[-1] + forecast_horizon)

        # --- Generate Individual Plots for each Time Step ---
        print(f"Generating {num_frames} individual plots for the rolling forecast animation...")

        try:
            # We loop through each point in time where a forecast was made
            for i in range(num_frames):
                fig, ax = plt.subplots(figsize=(14, 7), dpi=100) # dpi=100 for faster generation
                fig.patch.set_facecolor('white')

                # --- 1. Plot Historical Data ---
                # The h=0 actual of every sample up to and including sample i.
                # NOTE(review): this line therefore ends on time_index[i], which is
                # also the first forecast step — confirm the overlap is intended.
                historical_x = time_index[:i+1]
                historical_y = y_true[:i+1, 0, 0]
                ax.plot(historical_x, historical_y, label='Historical Production', color=color_history, linewidth=2)

                # --- 2. Plot the Forecast ---
                forecast_x = time_index[i] + np.arange(forecast_horizon)
                forecast_y = y_pred[i, :, 0]
                ax.plot(forecast_x, forecast_y, label='Forecast', color=color_forecast, linewidth=2, linestyle='--')

                # --- 3. Plot the Future Ground Truth for comparison ---
                # y_true[i, :, 0] holds the actual values for the forecast made at sample i.
                future_true_y = y_true[i, :, 0]
                ax.plot(forecast_x, future_true_y, label='Future Actual', color=color_future_true, linewidth=2, linestyle=':')

                # --- 4. Add a vertical line to show the "present" ---
                ax.axvline(x=time_index[i], color='grey', linestyle='--', linewidth=1.5, label=f'Forecast Point (t={time_index[i]})')

                # --- Aesthetics and Labels ---
                ax.set_xlabel('Time Steps', fontsize=14)
                ax.set_ylabel('Production', fontsize=14)
                ax.set_title(f'Rolling Forecast Animation ({prediction_method})', fontsize=18)
                ax.legend(frameon=True, edgecolor='black', fontsize=10, loc='upper left')
                ax.grid(True, which='both', linestyle='--', linewidth=0.5)

                # Identical limits on every frame keep the animation from jumping.
                ax.set_ylim(plot_ylim)
                ax.set_xlim(plot_xlim)

                fig.tight_layout()

                # Save the figure to a file
                filename = f"{plots_dir}/frame_{i:04d}.png" # Use 4 digits for potentially many frames
                fig.savefig(filename)
                image_files.append(filename)
                plt.close(fig) # Close the figure to free up memory

            print(f"Successfully generated {len(image_files)} plot images in '{plots_dir}/'")

            # --- Create GIF from the saved plots ---
            gif_path = os.path.join(output_dir, f'rolling_forecast_{prediction_method_raw}.gif')
            print(f"Creating GIF... Saving to {gif_path}")

            # Intended pace: 100 ms per frame (10 FPS).
            # NOTE(review): the unit of `duration` depends on the installed imageio
            # version/plugin (seconds vs. milliseconds) — confirm against the
            # environment used to render the presentation.
            with imageio.get_writer(gif_path, mode='I', duration=100, loop=0) as writer:
                for filename in image_files:
                    image = imageio.imread(filename)
                    writer.append_data(image)

            print("GIF creation complete.")
        finally:
            # --- Clean up the individual plot images ---
            # Runs even if rendering or GIF writing failed, so temporary frames
            # are not left behind.
            print(f"Cleaning up temporary image files from '{plots_dir}/'...")
            for filename in image_files:
                if os.path.exists(filename):
                    os.remove(filename)
            try:
                os.rmdir(plots_dir)
            except OSError as exc:
                # os.rmdir only removes empty directories; files left by an
                # earlier, interrupted run would otherwise abort the cell here.
                print(f"Could not remove '{plots_dir}/' (it may contain files not created by this run): {exc}")
            else:
                print("Cleanup complete.")

else:
    print("Data not loaded (the 'data_loaded' variable was not found or was False). Skipping plotting.")
