## Experiment description

The goal of this notebook is to preprocess all datasets in Wind Farm C to a binary format that can be used for training a TM Classifier.

In [17]:
# Import all required libraries

import pandas as pd
import numpy as np
import os

In [18]:
wind_farm = "C"
train_datasets = [55, 81, 47, 12, 4, 18, 28, 39, 66, 15, 8, 79, 30]
test_datasets = [55, 81, 47, 12, 4, 18, 28, 39, 66, 15, 8, 79, 30, 33, 11, 44, 49, 31, 67, 9, 91, 5, 90, 70, 35, 16,
                 76, 8, 85, 6, 62, 36, 56, 94, 54, 43, 50, 64, 46, 65, 61, 93, 75, 41, 58, 48, 88, 57, 32, 89, 59, 63,
                 80, 37, 29, 1, 20, 60]


In [19]:
# Create folders data_test and data_train if they do not exist
os.makedirs("data_test", exist_ok=True)
os.makedirs("data_train", exist_ok=True)

In [20]:
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    df = pd.read_csv(path, delimiter=';')

    event_info = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv", delimiter=';')

    # Find the row where event_id = event_id
    metadata = event_info[event_info['event_id'] == event_id]

    event_label = metadata["event_label"].values[0]
    event_start_id = metadata["event_start_id"].values[0]
    event_end_id = metadata["event_end_id"].values[0]

    label_value = 1 if event_label == "anomaly" else 0

    # All rows where the column "id" is between event_start_id and event_end_id
    df['label'] = 0
    df.loc[(df['id'] >= event_start_id) & (df['id'] <= event_end_id), 'label'] = label_value

    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]

    # Remove columns with suffixes in exclude_columns_with_suffix
    # df = df[[col for col in df.columns if not col.endswith('_avg')]]
    # df = df[[col for col in df.columns if not col.endswith('_min')]]
    # df = df[[col for col in df.columns if not col.endswith('_std')]]

    # Replace inf values with NaN and drop rows with NaN values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)

    return df

In [21]:
def calculate_min_max(farm, dataset_ids):
    # For each column get the min and max value
    min_max_values = {}

    for dataset_id in dataset_ids:
        df = load_df_and_annotate_anomalies(farm, dataset_id)

        for col in df.columns:
            min_val = df[col].min()
            max_val = df[col].max()

            if col not in min_max_values:
                min_max_values[col] = (min_val, max_val)
            else:
                current_min, current_max = min_max_values[col]
                min_max_values[col] = (min(min_val, current_min), max(max_val, current_max))

    return min_max_values

In [22]:
def convert_to_bit_integers_log(df, minmax, bits_per_column=8):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # If all values are 0, then set all values to 0
        if min_val == 0 and max_val == 0:
            normalized_df[col] = 0
            continue

        shifted = normalized_df[col] - min_val + 1
        log_data = np.log1p(shifted)

        global_log_min = np.log1p(1)  # log1p(1) because shift guarantees min = 1
        global_log_max = np.log1p(max_val - min_val + 1)  # Max in transformed space

        normalized_df[col] = (log_data - global_log_min) / (global_log_max - global_log_min) * (
                (2 ** bits_per_column) - 1)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)

    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()

    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:0{bits_per_column}b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays


def convert_to_bit_integers(df, minmax, bits_per_column=8):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # If all values are the same, set all to 0
        if min_val == max_val:
            normalized_df[col] = 0
            continue

        # Min-Max normalization
        normalized_df[col] = (normalized_df[col] - min_val) / (max_val - min_val)
        normalized_df[col] *= (2 ** bits_per_column) - 1

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)

    # Flatten each row into an array of bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()

    # Convert each integer to a binary string of fixed length
    bin_arrays = [[f"{cell:0{bits_per_column}b}" for cell in row] for row in int_arrays]

    # Convert binary strings into individual bits
    bin_int_arrays = [[int(bit) for bit in ''.join(row)] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

In [23]:
def binarize_dataset_for_training(farm, event_id, output_path, min_max_values, bits_per_column=8):
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)
    df = df[df['train_test'] == 'train']

    df = df[df['status_type_id'].isin([0, 2])]

    # Take a sample of 2000 rows
    df = df.sample(n=2000, random_state=42)

    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])

    #X_values = X_values.apply(pd.to_numeric, errors='coerce')

    X_values_bin = convert_to_bit_integers(X_values, min_max_values, bits_per_column).astype(np.uint32)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", X_values_bin, fmt='%d')

    label_df = pd.DataFrame({
        'label': df['label'].values,
        'status_type_id': df['status_type_id'].values,
        'train_test': df['train_test'].values
    })

    label_df.to_csv(f"{output_path}/y_{farm}_{event_id}.csv", index=False)

    print(f"Done with {event_id}: {X_values_bin.shape}")


def binarize_dataset_for_testing(farm, event_id, output_path, min_max_values, bits_per_column=8):
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)

    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    #X_values = X_values.apply(pd.to_numeric, errors='coerce')

    X_values_bin = convert_to_bit_integers(X_values, min_max_values, bits_per_column).astype(np.uint32)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", X_values_bin, fmt='%d')

    label_df = pd.DataFrame({
        'label': df['label'].values,
        'status_type_id': df['status_type_id'].values,
        'train_test': df['train_test'].values
    })

    label_df.to_csv(f"{output_path}/y_{farm}_{event_id}.csv", index=False)

    print(f"Done with {event_id}: {X_values_bin.shape}")


In [24]:
min_max_values_dict = calculate_min_max(wind_farm, test_datasets)

# Save to file
with open("min_max_values.txt", "w") as f:
    for key, value in min_max_values_dict.items():
        f.write(f"{key}: {value}\n")

KeyboardInterrupt: 

In [27]:
# Read min_max_values from file

min_max_values_dict = {}

with open("min_max_values.txt", "r") as f:
    for line in f:
        key, value = line.strip().split(": ")
        min_max_values_dict[key] = tuple(map(any, value.strip("()").split(",")))



In [28]:
bits = 5

In [29]:
for dataset in test_datasets:
    binarize_dataset_for_testing(wind_farm, dataset, "./data_test", min_max_values_dict, bits)

Done with 55: (55753, 4760)
Done with 81: (53932, 4760)
Done with 47: (53993, 4760)
Done with 12: (56107, 4760)
Done with 4: (56449, 4760)
Done with 18: (52848, 4760)
Done with 28: (55918, 4760)
Done with 39: (53727, 4760)
Done with 66: (53503, 4760)
Done with 15: (54433, 4760)
Done with 8: (54802, 4760)
Done with 79: (53281, 4760)
Done with 30: (56111, 4760)
Done with 33: (55873, 4760)
Done with 11: (56437, 4760)
Done with 44: (63003, 4760)
Done with 49: (53014, 4760)
Done with 31: (54589, 4760)
Done with 67: (61489, 4760)
Done with 9: (56029, 4760)
Done with 91: (56608, 4760)
Done with 5: (52795, 4760)
Done with 90: (54880, 4760)
Done with 70: (56038, 4760)
Done with 35: (52615, 4760)
Done with 16: (53568, 4760)
Done with 76: (52086, 4760)
Done with 8: (54802, 4760)
Done with 85: (52417, 4760)
Done with 6: (54865, 4760)
Done with 62: (53448, 4760)
Done with 36: (54440, 4760)
Done with 56: (53416, 4760)
Done with 94: (54865, 4760)
Done with 54: (55585, 4760)
Done with 43: (55153, 4760

In [30]:
for dataset in train_datasets:
    binarize_dataset_for_training(wind_farm, dataset, "./data_train", min_max_values_dict, bits)

Done with 55: (2000, 4760)
Done with 81: (2000, 4760)
Done with 47: (2000, 4760)
Done with 12: (2000, 4760)
Done with 4: (2000, 4760)
Done with 18: (2000, 4760)
Done with 28: (2000, 4760)
Done with 39: (2000, 4760)
Done with 66: (2000, 4760)
Done with 15: (2000, 4760)
Done with 8: (2000, 4760)
Done with 79: (2000, 4760)
Done with 30: (2000, 4760)
