# Construction of IO dataset for Power Flow BBMs


In [1]:
# Importing libraries
import os
from pathlib import Path
import dill as pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

import greyboxmodels.cpsmodels.physical.electrical.PowerFlowPowerGrid as PG
import greyboxmodels.cpsmodels.Plant as Plant

# Run from the project root so that the relative output paths used later resolve there.
# NOTE(review): this is an absolute, machine-specific path.
os.chdir("D:/projects/IPTLC_BBMs")
print("Current working directory: " + os.getcwd())

Current working directory: D:\projects\IPTLC_BBMs


In [2]:
# Folder holding the simulation pickles to process.
# Alternative dataset path (currently disabled):
# data_folder = Path("D:/projects/Hierarchical_CPS_models/data/simulations/controlled_pg/20240227_195448")
data_folder = Path("D:/projects/CPS-SenarioGeneration/data/monte_carlo/controlled_power_grid/2024-03-20_18-55-20")

# Destination for the processed arrays: it reuses the name of the source run folder
# and is created (including parents) when it does not exist yet.
processed_data_folder = Path("data/IO-datasets/PF/") / data_folder.name
processed_data_folder.mkdir(parents=True, exist_ok=True)

# Show both locations
print(
    f"Data folder: {data_folder}",
    f"Processed data folder: {processed_data_folder}",
    sep="\n",
)

Data folder: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20
Processed data folder: data\IO-datasets\PF\2024-03-20_18-55-20


In [3]:
# Load one simulation file and extract the power-flow inputs and outputs of every step
def get_pf_data(filepath: Path):
    """Read a pickled simulation and return its power-flow inputs and outputs.

    Args:
        filepath: Path to a pickled simulation. The loaded object must expose
            ``['step_data']``, an iterable of records that each contain
            ``['power_grid_step_data']['power_flow_input']`` and
            ``['power_grid_step_data']['power_flow_output']``.

    Returns:
        A tuple ``(pf_inputs, pf_outputs)`` of numpy arrays built from the
        per-step input and output entries, in step order.
    """
    # NOTE: unpickling executes arbitrary code; only load files from a trusted source.
    with open(filepath, "rb") as handle:
        simulation = pickle.load(handle)

    # Collect the per-step values in two passes over the step records
    steps = simulation['step_data']
    input_rows = [step['power_grid_step_data']["power_flow_input"] for step in steps]
    output_rows = [step['power_grid_step_data']["power_flow_output"] for step in steps]

    return np.array(input_rows), np.array(output_rows)

# Smoke-test the loader on a single simulation file
target_simulation = data_folder.joinpath("simulation_0.pkl")
pf_inputs, pf_outputs = get_pf_data(target_simulation)
# NOTE(review): the labels say "CC" although the arrays hold power-flow data;
# confirm whether "PF" was intended.
print(
    f"Testing the file: {target_simulation}",
    f"CC inputs shape: {pf_inputs.shape}",
    f"CC outputs shape: {pf_outputs.shape}",
    sep="\n",
)

Testing the file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_0.pkl
CC inputs shape: (384, 57)
CC outputs shape: (384, 105)


In [5]:
# Iterate over all the simulations and return the inputs and outputs stacked into single numpy arrays
def get_pf_data_all(data_folder: Path):
    """Load every simulation file in a folder and stack the power-flow data.

    Files whose name starts with ``"simulation"`` (``simulation_0.pkl``,
    ``simulation_1.pkl``, ...) are read with ``get_pf_data``. A file that fails
    to load is reported together with the cause and skipped, so a single
    corrupted file does not abort the whole run.

    Args:
        data_folder: Folder containing the simulation pickles and ``plant.pkl``.

    Returns:
        A tuple ``(inputs_matrix, outputs_matrix, plant)`` where the two
        matrices are the per-file arrays concatenated along axis 0 and
        ``plant`` is the object unpickled from ``data_folder / "plant.pkl"``.

    Raises:
        ValueError: If no simulation file could be loaded.
    """
    # Per-file arrays, concatenated once at the end
    inputs = []
    outputs = []

    # The targets are files (not folders). Sorting makes the row order of the
    # result independent of the order in which the OS lists the directory.
    simulation_files = sorted(
        f for f in data_folder.iterdir() if f.is_file() and f.name.startswith("simulation")
    )

    for f in tqdm.tqdm(simulation_files):
        try:
            pf_inputs, pf_outputs = get_pf_data(f)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt/SystemExit still stop the loop; report the cause.
            print(f"Error in file: {f} ({type(exc).__name__}: {exc})")
            continue

        inputs.append(pf_inputs)
        outputs.append(pf_outputs)

    # np.concatenate on an empty list fails with an unhelpful message
    if not inputs:
        raise ValueError(f"No simulation could be loaded from: {data_folder}")

    # Stack all simulations along the step axis
    inputs_matrix = np.concatenate(inputs, axis=0)
    outputs_matrix = np.concatenate(outputs, axis=0)

    # Get the plant
    # NOTE: unpickling executes arbitrary code; only load files from a trusted source.
    with open(data_folder / "plant.pkl", "rb") as plant_file:
        plant = pickle.load(plant_file)

    return inputs_matrix, outputs_matrix, plant

# Load the whole dataset (plus the plant object) and report the stacked shapes
pf_inputs, pf_outputs, plant = get_pf_data_all(data_folder)
# NOTE(review): the labels say "CC" although the arrays hold power-flow data;
# confirm whether "PF" was intended.
print(
    f"CC inputs shape: {pf_inputs.shape}",
    f"CC outputs shape: {pf_outputs.shape}",
    sep="\n",
)

  9%|▉         | 7/75 [00:07<01:02,  1.09it/s]

Error in file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_14.pkl


 61%|██████▏   | 46/75 [00:36<00:20,  1.42it/s]

Error in file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_5.pkl


 64%|██████▍   | 48/75 [00:37<00:17,  1.54it/s]

Error in file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_51.pkl


 68%|██████▊   | 51/75 [00:40<00:16,  1.46it/s]

Error in file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_54.pkl


 92%|█████████▏| 69/75 [00:53<00:04,  1.24it/s]

Error in file: D:\projects\CPS-SenarioGeneration\data\monte_carlo\controlled_power_grid\2024-03-20_18-55-20\simulation_70.pkl


100%|██████████| 75/75 [00:58<00:00,  1.28it/s]

CC inputs shape: (26874, 57)
CC outputs shape: (26874, 105)





In [6]:
# Persist the raw (un-normalized) arrays in the processed-data folder.
# Rows containing NaN are still present at this point.
inputs_path = processed_data_folder.joinpath("pf_inputs.npy")
outputs_path = processed_data_folder.joinpath("pf_outputs.npy")

np.save(inputs_path, pf_inputs)
np.save(outputs_path, pf_outputs)

## Normalize the inputs and outputs

In [37]:
# Min-max normalize an array along axis 0, optionally reusing previously computed statistics
def min_max_normalize(array: np.ndarray, min_array: np.ndarray = None, max_array: np.ndarray = None):
    """Scale ``array`` with ``(array - min) / (max - min)`` along axis 0.

    Args:
        array: Data to normalize. Statistics are taken over axis 0, so a 2-D
            array is normalized per column and a 1-D array as a whole.
        min_array: Minimum to use. Computed from ``array`` when ``None``.
        max_array: Maximum to use. Computed from ``array`` when ``None``.

    Returns:
        A tuple ``(array_normalized, min_array, max_array)`` with the
        statistics that were actually applied, so they can be passed back in
        to normalize other data consistently.

    Notes:
        * When at least one statistic is computed here, entries where the
          minimum equals the maximum (constant columns) get their minimum
          lowered by 1. This avoids a division by zero; such columns
          normalize to 1.
        * When both statistics are supplied they are used exactly as given.
        * The statistics are never modified in place.
    """
    if min_array is None or max_array is None:
        # Fill in only the missing statistic(s); a supplied one is honoured
        min_array = array.min(axis=0) if min_array is None else np.asarray(min_array)
        max_array = array.max(axis=0) if max_array is None else np.asarray(max_array)

        # Degenerate range: shift the minimum down by one so that max - min == 1.
        # np.where (instead of masked assignment) also works for the numpy
        # scalars produced by 1-D input and leaves the inputs untouched.
        min_array = np.where(min_array == max_array, min_array - 1, min_array)

    array_normalized = (array - min_array) / (max_array - min_array)

    return array_normalized, min_array, max_array

In [38]:
# Flag every row that has a NaN in either the inputs or the outputs, then drop
# those rows from both arrays so the two stay aligned row by row
nan_rows = np.logical_or(
    np.isnan(pf_inputs).any(axis=1),
    np.isnan(pf_outputs).any(axis=1),
)

pf_inputs_no_nan = pf_inputs[np.logical_not(nan_rows)]
pf_outputs_no_nan = pf_outputs[np.logical_not(nan_rows)]

In [39]:
# Normalize the NaN-free inputs; the min/max statistics are kept for later reuse
pf_inputs_normalized, min_pf_input, max_pf_input = min_max_normalize(array=pf_inputs_no_nan)

pf_inputs_normalized.shape

(26660, 57)

In [40]:
# Normalize the NaN-free outputs; the min/max statistics are kept for later reuse
pf_outputs_normalized, min_pf_output, max_pf_output = min_max_normalize(array=pf_outputs_no_nan)

pf_outputs_normalized.shape

(26660, 105)

In [41]:
# Persist the normalized arrays in the processed-data folder
inputs_normalized_path = processed_data_folder.joinpath("pf_inputs_minmax_normalized.npy")
outputs_normalized_path = processed_data_folder.joinpath("pf_outputs_minmax_normalized.npy")

np.save(inputs_normalized_path, pf_inputs_normalized)
np.save(outputs_normalized_path, pf_outputs_normalized)

# Store the statistics used for the normalization alongside the data so the
# same scaling can be re-applied later. Readers must use exactly these keys.
min_max_values = {
    "input_min": min_pf_input,
    "input_max": max_pf_input,
    "output_min": min_pf_output,
    "output_max": max_pf_output,
    "type": "min_max"
}

min_max_values_path = processed_data_folder.joinpath("norm_min_max_values.pkl")
with min_max_values_path.open("wb") as f:
    pickle.dump(min_max_values, f)

## Get the ground truth data

In [53]:
# Choose a simulation
# NOTE(review): the recorded output of the bulk-loading cell reports
# "Error in file: ...simulation_5.pkl"; confirm that this file can be loaded.
gt_sim = data_folder / "simulation_5.pkl"

# Open the minimum and maximum values
with open(min_max_values_path, "rb") as f:
    min_max_values = pickle.load(f)

# The statistics are stored under "input_min" / "input_max" / "output_min" /
# "output_max" (see the dict written to norm_min_max_values.pkl); any other
# key raises a KeyError.
min_pf_input = min_max_values["input_min"]
max_pf_input = min_max_values["input_max"]
min_pf_output = min_max_values["output_min"]
max_pf_output = min_max_values["output_max"]

# Extract the ground-truth trajectory and scale it with the stored statistics
# so it is consistent with the normalized dataset.
# NOTE(review): rows containing NaN are not removed here, unlike for the
# dataset arrays; confirm this is intended.
gt_inputs, gt_outputs = get_pf_data(gt_sim)

gt_inputs_normalized, _, _ = min_max_normalize(gt_inputs, min_pf_input, max_pf_input)
gt_outputs_normalized, _, _ = min_max_normalize(gt_outputs, min_pf_output, max_pf_output)

# Save the normalized ground truth to the processed-data folder
gt_inputs_normalized_path = processed_data_folder / "gt_inputs_minmax_normalized.npy"
gt_outputs_normalized_path = processed_data_folder / "gt_outputs_minmax_normalized.npy"

np.save(gt_inputs_normalized_path, gt_inputs_normalized)
np.save(gt_outputs_normalized_path, gt_outputs_normalized)