## Experiment description

The goal of this notebook is to preprocess all datasets of the selected wind farm (set by `wind_farm` below, currently "A") into a binary format that can be used for training a TM Classifier.

In [166]:
# Import all required libraries

import pandas as pd
import numpy as np
import os

In [167]:
# Wind farm whose datasets are processed; it is interpolated into the input paths.
wind_farm = "A"

# Dataset (event) ids to binarize. The test split currently covers exactly the
# same ids as the training split, so it is built as an independent copy of it.
train_datasets = [
    68, 22, 72, 73, 0, 26, 40, 42, 10, 45, 84,
    25, 69, 13, 24, 3, 17, 38, 71, 14, 92, 51,
]
test_datasets = list(train_datasets)

In [168]:
# Make sure both output folders exist before any file is written into them;
# exist_ok=True keeps the cell re-runnable.
for output_dir in ("data_test", "data_train"):
    os.makedirs(output_dir, exist_ok=True)

In [169]:
# Bookkeeping columns that are removed from every loaded dataset.
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    """Load one event dataset and add a binary ``label`` column.

    The dataset CSV and the farm's ``event_info.csv`` are read with ``;`` as
    delimiter. Rows whose ``id`` lies inside ``[event_start_id, event_end_id]``
    (both inclusive) get ``label = 1`` when the event is labelled ``"anomaly"``;
    every other row gets ``label = 0``.

    Parameters
    ----------
    farm : str
        Wind farm identifier used in the directory name ``Wind Farm {farm}``.
    event_id : int
        Event id; selects both ``datasets/{event_id}.csv`` and the matching
        row of ``event_info.csv``.

    Returns
    -------
    pandas.DataFrame
        The dataset without the columns in ``exclude_columns`` and without any
        column ending in ``_max``, ``_min`` or ``_std``, with the added
        ``label`` column. Rows containing NaN or +/-inf in any remaining column
        are dropped; the original row index is kept.

    Raises
    ------
    ValueError
        If ``event_id`` has no row in ``event_info.csv``.
    """
    base_dir = f"../../../data/care_to_compare/Wind Farm {farm}"
    df = pd.read_csv(f"{base_dir}/datasets/{event_id}.csv", delimiter=';')
    event_info = pd.read_csv(f"{base_dir}/event_info.csv", delimiter=';')

    # Locate the metadata row of this event and fail with a clear message
    # instead of an opaque IndexError when it does not exist.
    metadata = event_info[event_info['event_id'] == event_id]
    if metadata.empty:
        raise ValueError(
            f"event_id {event_id!r} not found in event_info.csv of Wind Farm {farm}"
        )

    event_label = metadata["event_label"].iloc[0]
    event_start_id = metadata["event_start_id"].iloc[0]
    event_end_id = metadata["event_end_id"].iloc[0]

    label_value = 1 if event_label == "anomaly" else 0

    # Only rows inside the event window can be positive; the label has to be
    # assigned before "id" is removed together with the other excluded columns.
    df['label'] = 0
    in_event_window = df['id'].between(event_start_id, event_end_id)
    df.loc[in_event_window, 'label'] = label_value

    # Single pass: drop the bookkeeping columns and the per-interval
    # max/min/std statistic columns.
    dropped_suffixes = ('_max', '_min', '_std')
    kept_columns = [
        col for col in df.columns
        if col not in exclude_columns and not col.endswith(dropped_suffixes)
    ]

    # Treat +/-inf like missing values and drop every incomplete row. Done
    # without inplace=True so no derived frame is mutated behind the scenes.
    return df[kept_columns].replace([np.inf, -np.inf], np.nan).dropna()

In [170]:
def calculate_min_max(farm, dataset_ids):
    """Compute the global per-column value range over several datasets.

    Each dataset is loaded through ``load_df_and_annotate_anomalies`` and the
    minimum / maximum of every returned column (this includes the non-feature
    columns such as ``label``) is merged into a running range.

    Parameters
    ----------
    farm : str
        Wind farm identifier forwarded to the loader.
    dataset_ids : iterable of int
        Ids of the datasets to scan.

    Returns
    -------
    dict
        Maps column name to a ``(min, max)`` tuple. Empty when no dataset
        contributed any rows.
    """
    min_max_values = {}

    for dataset_id in dataset_ids:
        df = load_df_and_annotate_anomalies(farm, dataset_id)

        # A dataset with no rows left after cleaning reports NaN as min/max.
        # Python's min()/max() keep a NaN depending on argument order, so one
        # empty dataset could silently wipe out the range collected so far.
        if df.empty:
            continue

        for col in df.columns:
            col_min = df[col].min()
            col_max = df[col].max()

            if col in min_max_values:
                known_min, known_max = min_max_values[col]
                col_min = min(col_min, known_min)
                col_max = max(col_max, known_max)

            min_max_values[col] = (col_min, col_max)

    return min_max_values

In [171]:
def _levels_to_bits(scaled_df, bits_per_column):
    """Truncate scaled values to integers and expand each one into its bits.

    Returns a 2-D integer array with ``bits_per_column`` entries (most
    significant bit first) per input column, columns kept in frame order.
    """
    max_level = (2 ** bits_per_column) - 1

    # astype(int) truncates toward zero and raises on NaN/inf, so corrupt
    # input fails loudly instead of producing garbage bits.
    levels = scaled_df.astype(int).to_numpy(dtype=np.int64)

    # Every level must fit into bits_per_column bits.
    levels = np.clip(levels, 0, max_level)

    # Shift each level so that bit k lands in the lowest position, then mask it.
    shifts = np.arange(bits_per_column - 1, -1, -1)
    bits = (levels[:, :, np.newaxis] >> shifts) & 1

    return bits.reshape(levels.shape[0], levels.shape[1] * bits_per_column)


def convert_to_bit_integers(df, minmax, bits_per_column=8):
    """Binarize a numeric frame using log-scaled min-max normalization.

    Each value is shifted so the column minimum becomes 1, passed through
    ``log1p``, scaled to ``[0, 2**bits_per_column - 1]``, truncated to an
    integer and written out as ``bits_per_column`` bits.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns.
    minmax : dict
        Maps every column of ``df`` to a ``(min, max)`` pair.
    bits_per_column : int, default 8
        Number of bits used per column.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(df), len(df.columns) * bits_per_column)``
        containing only 0 and 1.

    Notes
    -----
    Columns whose range is a single value (``min == max``) are encoded as all
    zeros. Values outside ``[min, max]`` are clipped to that range, so they
    saturate at the lowest / highest level.
    """
    normalized_df = df.copy()
    max_level = (2 ** bits_per_column) - 1

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # A constant column carries no information; without this guard the
        # scaling below would divide zero by zero.
        if min_val == max_val:
            normalized_df[col] = 0
            continue

        clipped = normalized_df[col].clip(lower=min_val, upper=max_val)
        shifted = clipped - min_val + 1
        log_data = np.log1p(shifted)

        global_log_min = np.log1p(1)  # the shift maps the column minimum to 1
        global_log_max = np.log1p(max_val - min_val + 1)  # maximum in log space

        normalized_df[col] = (log_data - global_log_min) / (global_log_max - global_log_min) * max_level

    return _levels_to_bits(normalized_df, bits_per_column)


def convert_to_bit_integers_minmax(df, minmax, bits_per_column=8):
    """Binarize a numeric frame using plain (linear) min-max normalization.

    Same contract as ``convert_to_bit_integers`` but values are scaled
    linearly between the column minimum and maximum instead of in log space.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns.
    minmax : dict
        Maps every column of ``df`` to a ``(min, max)`` pair.
    bits_per_column : int, default 8
        Number of bits used per column.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(df), len(df.columns) * bits_per_column)``
        containing only 0 and 1.
    """
    normalized_df = df.copy()
    max_level = (2 ** bits_per_column) - 1

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # If all values are the same, encode the column as zeros
        if min_val == max_val:
            normalized_df[col] = 0
            continue

        clipped = normalized_df[col].clip(lower=min_val, upper=max_val)
        normalized_df[col] = (clipped - min_val) / (max_val - min_val) * max_level

    return _levels_to_bits(normalized_df, bits_per_column)

In [172]:
def binarize_dataset_for_training(farm, event_id, output_path, min_max_values, bits_per_column=8):
    """Binarize the training rows of one dataset and write them to text files.

    Only rows with ``train_test == 'train'`` and ``status_type_id == 0`` are
    kept. Features are written to ``X_{farm}_{event_id}.txt`` and labels to
    ``y_{farm}_{event_id}.txt`` inside ``output_path``.

    NOTE(review): an earlier comment here said status ids 0 *or 2* are kept,
    but the filter only ever accepted 0 - confirm which one is intended.

    Parameters
    ----------
    farm : str
        Wind farm identifier.
    event_id : int
        Dataset (event) id to process.
    output_path : str
        Existing directory that receives the output files.
    min_max_values : dict
        Column name -> ``(min, max)`` used for normalization.
    bits_per_column : int, default 8
        Number of bits per feature.
    """
    annotated = load_df_and_annotate_anomalies(farm, event_id)

    # Restrict to the training period and to rows with status_type_id 0
    in_train_period = annotated['train_test'] == 'train'
    has_status_zero = annotated['status_type_id'].isin([0])
    selected = annotated[in_train_period & has_status_zero]

    # Separate the labels from the feature columns
    labels = selected['label']
    features = selected.drop(columns=['label', 'train_test', 'status_type_id'])

    # Coerce to numbers; a column holding anything non-numeric is dropped
    features = features.apply(pd.to_numeric, errors='coerce').dropna(axis=1)

    print(f"Number of features: {len(features.columns)}")

    X_values_bin = convert_to_bit_integers(features, min_max_values, bits_per_column).astype(np.uint32)
    y_values_bin = labels.values.astype(np.uint32)

    print(f"Number of columns: {X_values_bin.shape[1]}")

    # One text file per array, named <prefix>_<farm>_<event_id>.txt
    for prefix, array in (("X", X_values_bin), ("y", y_values_bin)):
        np.savetxt(f"{output_path}/{prefix}_{farm}_{event_id}.txt", array, fmt='%d')

    num_1s = int(np.sum(y_values_bin == 1))
    num_0s = int(np.sum(y_values_bin == 0))

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")


def binarize_dataset_for_testing(farm, event_id, output_path, min_max_values, bits_per_column=8):
    """Binarize every row of one dataset for evaluation and write text files.

    Unlike the training variant no rows are filtered out. Besides the features
    (``X_{farm}_{event_id}.txt``) and labels (``y_{farm}_{event_id}.txt``), the
    ``status_type_id`` of each row is written to ``z_{farm}_{event_id}.txt``.

    Parameters
    ----------
    farm : str
        Wind farm identifier.
    event_id : int
        Dataset (event) id to process.
    output_path : str
        Existing directory that receives the output files.
    min_max_values : dict
        Column name -> ``(min, max)`` used for normalization.
    bits_per_column : int, default 8
        Number of bits per feature.
    """
    annotated = load_df_and_annotate_anomalies(farm, event_id)

    # Pull out labels and status ids, everything else is a feature
    labels = annotated['label']
    status_ids = annotated['status_type_id']
    features = annotated.drop(columns=['label', 'train_test', 'status_type_id'])

    # Coerce to numbers; a column holding anything non-numeric is dropped
    features = features.apply(pd.to_numeric, errors='coerce').dropna(axis=1)

    print(f"Number of features: {len(features.columns)}")

    X_values_bin = convert_to_bit_integers(features, min_max_values, bits_per_column).astype(np.uint32)
    y_values_bin = labels.values.astype(np.uint32)
    z_values_bin = status_ids.values.astype(np.uint32)

    print(f"Number of columns: {X_values_bin.shape[1]}")

    # One text file per array, named <prefix>_<farm>_<event_id>.txt
    outputs = (("X", X_values_bin), ("y", y_values_bin), ("z", z_values_bin))
    for prefix, array in outputs:
        np.savetxt(f"{output_path}/{prefix}_{farm}_{event_id}.txt", array, fmt='%d')

    num_1s = int(np.sum(y_values_bin == 1))
    num_0s = int(np.sum(y_values_bin == 0))

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")

In [173]:
# The value ranges must cover every dataset that gets binarized below, otherwise
# values of a dataset missing from the scan can fall outside the range used for
# scaling. Build the order-preserving union of both id lists (test ids first);
# for identical lists this is exactly test_datasets.
min_max_dataset_ids = list(dict.fromkeys(test_datasets + train_datasets))
min_max_values_dict = calculate_min_max(wind_farm, min_max_dataset_ids)

# Save the ranges to a text file, one "column: (min, max)" line per column
with open("min_max_values.txt", "w") as f:
    for key, value in min_max_values_dict.items():
        f.write(f"{key}: {value}\n")

In [174]:
# Number of bits used to encode each feature column; passed as bits_per_column
# to the binarize_dataset_* calls below.
bits = 8

In [175]:
# Binarize every test dataset into ./data_test using the shared min/max ranges
# (writes the X_, y_ and z_ files for each dataset id).
for dataset in test_datasets:
    binarize_dataset_for_testing(wind_farm, dataset, "./data_test", min_max_values_dict, bits)

Number of features: 54
Number of columns: 432
Saved 68 to ./data_test
Number of 1s: 2014, Number of 0s: 52344
Number of features: 54
Number of columns: 432
Saved 22 to ./data_test
Number of 1s: 1004, Number of 0s: 52031
Number of features: 54
Number of columns: 432
Saved 72 to ./data_test
Number of 1s: 1009, Number of 0s: 53072
Number of features: 54
Number of columns: 432
Saved 73 to ./data_test
Number of 1s: 1009, Number of 0s: 53033
Number of features: 54
Number of columns: 432
Saved 0 to ./data_test
Number of 1s: 2011, Number of 0s: 52974
Number of features: 54
Number of columns: 432
Saved 26 to ./data_test
Number of 1s: 1009, Number of 0s: 52692
Number of features: 54
Number of columns: 432
Saved 40 to ./data_test
Number of 1s: 4506, Number of 0s: 51650
Number of features: 54
Number of columns: 432
Saved 42 to ./data_test
Number of 1s: 1007, Number of 0s: 52876
Number of features: 54
Number of columns: 432
Saved 10 to ./data_test
Number of 1s: 981, Number of 0s: 52608
Number of fe

In [176]:
# Binarize the training rows of every training dataset into ./data_train using
# the same min/max ranges as the test data (writes the X_ and y_ files).
for dataset in train_datasets:
    binarize_dataset_for_training(wind_farm, dataset, "./data_train", min_max_values_dict, bits)

Number of features: 54
Number of columns: 432
Saved 68 to ./data_train
Number of 1s: 0, Number of 0s: 40437
Number of features: 54
Number of columns: 432
Saved 22 to ./data_train
Number of 1s: 0, Number of 0s: 40804
Number of features: 54
Number of columns: 432
Saved 72 to ./data_train
Number of 1s: 0, Number of 0s: 40053
Number of features: 54
Number of columns: 432
Saved 73 to ./data_train
Number of 1s: 0, Number of 0s: 40828
Number of features: 54
Number of columns: 432
Saved 0 to ./data_train
Number of 1s: 0, Number of 0s: 39149
Number of features: 54
Number of columns: 432
Saved 26 to ./data_train
Number of 1s: 0, Number of 0s: 37604
Number of features: 54
Number of columns: 432
Saved 40 to ./data_train
Number of 1s: 0, Number of 0s: 39076
Number of features: 54
Number of columns: 432
Saved 42 to ./data_train
Number of 1s: 0, Number of 0s: 37405
Number of features: 54
Number of columns: 432
Saved 10 to ./data_train
Number of 1s: 0, Number of 0s: 36306
Number of features: 54
Number