## Experiment description

The goal of this notebook is to preprocess every dataset of Wind Farm C into a binary (bit-encoded) format that can be used for training and testing a TM Classifier.

In [51]:
# Import all required libraries

import pandas as pd
import numpy as np
import os

In [52]:
# Wind farm whose events are processed in this notebook.
wind_farm = "C"

# event_info.csv holds one row per event; the event_id column is also the
# file name of the matching dataset CSV.
available_datasets_df = pd.read_csv(
    f"../../../data/care_to_compare/Wind Farm {wind_farm}/event_info.csv",
    delimiter=';',
)
available_datasets = available_datasets_df["event_id"].values

print(available_datasets)

# Both loops below iterate over the same event ids: every dataset file is
# split further down by its train_test column ('train' vs. 'prediction' rows).
train_datasets = test_datasets = available_datasets


[55 81 47 12  4 18 28 39 66 15 78 79 30 33 11 44 49 31 67  9 91  5 90 70
 35 16 76  8 85  6 62 36 56 94 54 43 50 64 46 65 61 93 75 41 58 48 88 57
 32 89 59 63 80 37 29  1 20 60]


In [53]:
# Make sure both output directories exist before any file is written to them
for output_dir in ("data_test", "data_train"):
    os.makedirs(output_dir, exist_ok=True)

In [54]:
# Bookkeeping columns that are removed from every loaded dataset
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    """Load one event dataset and attach a binary ``label`` column.

    The dataset CSV and the farm's ``event_info.csv`` are both read from disk.
    Rows whose ``id`` lies inside the event window (start and end inclusive)
    get label 1 when the event is labelled "anomaly"; every other row gets 0.

    Args:
        farm: Wind farm identifier used to build the data paths.
        event_id: Event id; selects the dataset file and the event_info row.

    Returns:
        DataFrame without the columns listed in ``exclude_columns`` and without
        any column whose name ends in ``_max``, ``_min`` or ``_std``.

    Raises:
        IndexError: If ``event_info.csv`` has no row for ``event_id``.
    """
    data = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv",
                       delimiter=';')
    events = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv",
                         delimiter=';')

    # Metadata of the first event_info row matching this event
    metadata = events[events['event_id'] == event_id]
    event_label, event_start_id, event_end_id = (
        metadata[field].values[0]
        for field in ("event_label", "event_start_id", "event_end_id")
    )

    # Default every row to 0, then mark the (inclusive) event window
    in_event_window = data['id'].between(event_start_id, event_end_id)
    data['label'] = 0
    data.loc[in_event_window, 'label'] = 1 if event_label == "anomaly" else 0

    # Single pass over the columns: drop bookkeeping columns and the
    # per-interval statistics (_max / _min / _std)
    dropped_suffixes = ('_max', '_min', '_std')
    kept_columns = [
        name for name in data.columns
        if name not in exclude_columns and not name.endswith(dropped_suffixes)
    ]

    return data[kept_columns]

In [55]:
def calculate_min_max(farm, dataset_ids):
    """Collect the global minimum and maximum of every column over several datasets.

    Each dataset is loaded with ``load_df_and_annotate_anomalies`` and its
    per-column ``min()`` / ``max()`` are merged into a running result.

    Args:
        farm: Wind farm identifier forwarded to the loader.
        dataset_ids: Iterable of event ids to scan.

    Returns:
        dict mapping column name -> ``(min, max)`` over all scanned datasets.
        A column that is entirely missing in *every* dataset keeps
        ``(nan, nan)``.
    """
    min_max_values = {}

    for dataset_id in dataset_ids:
        df = load_df_and_annotate_anomalies(farm, dataset_id)

        for col in df.columns:
            min_val = df[col].min()
            max_val = df[col].max()

            previous = min_max_values.get(col)
            if previous is None or pd.isna(previous[0]) or pd.isna(previous[1]):
                # First sighting, or only all-missing data so far: take this
                # dataset's range as-is (it may still be NaN).
                min_max_values[col] = (min_val, max_val)
            elif not (pd.isna(min_val) or pd.isna(max_val)):
                # Builtin min()/max() are order-dependent with NaN, so a dataset
                # in which the column is entirely missing must be skipped rather
                # than merged; otherwise the running extremes get overwritten.
                min_max_values[col] = (min(min_val, previous[0]), max(max_val, previous[1]))

    return min_max_values

In [56]:
def convert_to_bit_integers(df, minmax, bits_per_column=8):
    """Encode every cell of ``df`` as ``bits_per_column`` binary digits.

    Each column is shifted so that its global minimum maps to 1, log-scaled
    with ``log1p``, normalised to ``[0, 2**bits_per_column - 1]``, truncated
    to an integer level and written out most-significant bit first.

    Args:
        df: DataFrame of numeric feature columns.
        minmax: Mapping column name -> ``(min, max)`` giving the global range
            used for normalisation.
        bits_per_column: Number of bits produced per column.

    Returns:
        Integer array of shape ``(len(df), len(df.columns) * bits_per_column)``
        containing only 0s and 1s. The bits of the first column come first in
        each row, then those of the second column, and so on.

    Raises:
        ValueError: If ``bits_per_column`` is smaller than 1, a column contains
            NaN/infinite values, or a range in ``minmax`` is NaN or inverted.
    """
    if bits_per_column < 1:
        raise ValueError("bits_per_column must be at least 1")

    max_level = (2 ** bits_per_column) - 1
    log_floor = np.log1p(1)  # the shift below maps the global minimum to 1

    # One integer level per cell; columns with a constant range stay at 0.
    levels = np.zeros((len(df), len(df.columns)), dtype=int)

    for position, col in enumerate(df.columns):
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        values = df[col].to_numpy(dtype=float)
        if not np.isfinite(values).all():
            raise ValueError(f"Column {col!r} contains NaN or infinite values")

        span = float(max_val - min_val)
        if span != span or span < 0:
            raise ValueError(f"Invalid (min, max) range for column {col!r}: {minmax[col]}")
        if span == 0:
            # Constant column: the normalisation would be 0/0, so encode it as level 0.
            continue

        # Clip to the known range so unseen extremes cannot yield negative
        # levels or levels needing more than bits_per_column digits.
        shifted = np.clip(values, min_val, max_val) - min_val + 1
        log_span = np.log1p(span + 1) - log_floor
        scaled = (np.log1p(shifted) - log_floor) / log_span * max_level

        levels[:, position] = np.clip(scaled, 0, max_level).astype(int)

    # Extract the bits of every level, most-significant first, and lay them
    # out side by side per row.
    shifts = np.arange(bits_per_column - 1, -1, -1)
    bit_cube = (levels[:, :, None] >> shifts) & 1

    return bit_cube.reshape(len(df), len(df.columns) * bits_per_column)

In [57]:
def binarize_dataset_for_training(farm, event_id, output_path, min_max_values, bits_per_column=8,
                                  n_samples=4000, random_state=42):
    """Binarize a random sample of the 'train' rows of one dataset and save it.

    Only rows with ``train_test == 'train'`` and ``status_type_id`` 0 or 2 are
    used. The features are written to ``X_{farm}_{event_id}.txt`` and the
    labels to ``y_{farm}_{event_id}.txt`` inside ``output_path``.

    Args:
        farm: Wind farm identifier.
        event_id: Event id of the dataset to process.
        output_path: Existing directory that receives the text files.
        min_max_values: Mapping column name -> ``(min, max)`` used for scaling.
        bits_per_column: Number of bits each feature is encoded with.
        n_samples: Maximum number of rows to draw; when fewer rows are
            available all of them are used instead of raising.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs
            write identical files; pass ``None`` for a fresh random draw.

    Raises:
        ValueError: If no row survives the train / status filters.
    """
    # Load original dataset from file and keep the training portion
    df = load_df_and_annotate_anomalies(farm, event_id)
    df = df[df['train_test'] == 'train']

    # Remove all rows where status_type_id is not 0 or 2
    df = df[df['status_type_id'].isin([0, 2])]

    if df.empty:
        raise ValueError(f"No training rows with status_type_id 0 or 2 in dataset {event_id}")

    # Draw at most n_samples random rows (capped at the number of rows available)
    df = df.sample(n=min(n_samples, len(df)), random_state=random_state)

    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    y_values = df['label']

    X_values = X_values.apply(pd.to_numeric, errors='coerce')
    # NOTE(review): this drops every column holding a NaN in the sampled rows,
    # so the feature count could differ between datasets - confirm that the
    # downstream consumer tolerates this.
    X_values = X_values.dropna(axis=1)

    # Print number of columns
    print(f"Number of features: {len(X_values.columns)}")

    X_values_bin = convert_to_bit_integers(X_values, min_max_values, bits_per_column).astype(np.uint32)
    y_values_bin = y_values.values.astype(np.uint32)

    print(f"Number of columns: {X_values_bin.shape[1]}")

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", X_values_bin, fmt='%d')
    np.savetxt(f"{output_path}/y_{farm}_{event_id}.txt", y_values_bin, fmt='%d')

    num_1s = np.count_nonzero(y_values_bin == 1)
    num_0s = np.count_nonzero(y_values_bin == 0)

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")


def binarize_dataset_for_testing(farm, event_id, output_path, min_max_values, bits_per_column=8):
    """Binarize all 'prediction' rows of one dataset and save them as text files.

    Three files are written into ``output_path``: ``X_{farm}_{event_id}.txt``
    (bit-encoded features), ``y_{farm}_{event_id}.txt`` (labels) and
    ``z_{farm}_{event_id}.txt`` (the ``status_type_id`` of every row).

    Args:
        farm: Wind farm identifier.
        event_id: Event id of the dataset to process.
        output_path: Existing directory that receives the text files.
        min_max_values: Mapping column name -> ``(min, max)`` used for scaling.
        bits_per_column: Number of bits each feature is encoded with.
    """
    annotated = load_df_and_annotate_anomalies(farm, event_id)
    prediction_rows = annotated.loc[annotated['train_test'].eq('prediction')]

    # Features: everything except the label and the two bookkeeping columns,
    # coerced to numbers; columns containing any NaN are dropped.
    features = (
        prediction_rows
        .drop(columns=['label', 'train_test', 'status_type_id'])
        .apply(pd.to_numeric, errors='coerce')
        .dropna(axis=1)
    )
    labels = prediction_rows['label']
    statuses = prediction_rows['status_type_id']

    print(f"Number of features: {len(features.columns)}")

    features_bin = convert_to_bit_integers(features, min_max_values, bits_per_column).astype(np.uint32)
    labels_bin = labels.values.astype(np.uint32)
    statuses_bin = statuses.values.astype(np.uint32)

    print(f"Number of columns: {features_bin.shape[1]}")

    # Persist features, labels and status ids next to each other
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", features_bin, fmt='%d')
    np.savetxt(f"{output_path}/y_{farm}_{event_id}.txt", labels_bin, fmt='%d')
    np.savetxt(f"{output_path}/z_{farm}_{event_id}.txt", statuses_bin, fmt='%d')

    # Class balance of the saved labels
    num_1s = np.count_nonzero(labels_bin == 1)
    num_0s = np.count_nonzero(labels_bin == 0)

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")

In [58]:
# Global per-column value ranges over every listed dataset
min_max_values_dict = calculate_min_max(wind_farm, test_datasets)

# Keep a human-readable copy of the ranges, one "column: (min, max)" line each
range_lines = [f"{key}: {value}\n" for key, value in min_max_values_dict.items()]
with open("min_max_values.txt", "w") as f:
    f.writelines(range_lines)

In [59]:
# Number of bits used to encode each feature (passed as bits_per_column below);
# with the 238 features reported in the output this gives 238 * 5 = 1190 binary columns.
bits = 5

In [60]:
# Write the X_/y_ text files for the 'train' rows of every event into ./data_train
for dataset in train_datasets:
    binarize_dataset_for_training(wind_farm, dataset, "./data_train", min_max_values_dict, bits)

Number of features: 238
Number of columns: 1190
Saved 55 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 81 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 47 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 12 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 4 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 18 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 28 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 39 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 238
Number of columns: 1190
Saved 66 to ./data_train
Number of 1s: 0, Number of 0s: 4000
Number of features: 

In [61]:
# Write the X_/y_/z_ text files for the 'prediction' rows of every event into ./data_test
for dataset in test_datasets:
    binarize_dataset_for_testing(wind_farm, dataset, "./data_test", min_max_values_dict, bits)

Number of features: 238
Number of columns: 1190
Saved 55 to ./data_test
Number of 1s: 2473, Number of 0s: 720
Number of features: 238
Number of columns: 1190
Saved 81 to ./data_test
Number of 1s: 364, Number of 0s: 1008
Number of features: 238
Number of columns: 1190
Saved 47 to ./data_test
Number of 1s: 713, Number of 0s: 864
Number of features: 238
Number of columns: 1190
Saved 12 to ./data_test
Number of 1s: 3259, Number of 0s: 288
Number of features: 238
Number of columns: 1190
Saved 4 to ./data_test
Number of 1s: 2737, Number of 0s: 1152
Number of features: 238
Number of columns: 1190
Saved 18 to ./data_test
Number of 1s: 576, Number of 0s: 1152
Number of features: 238
Number of columns: 1190
Saved 28 to ./data_test
Number of 1s: 2926, Number of 0s: 432
Number of features: 238
Number of columns: 1190
Saved 39 to ./data_test
Number of 1s: 735, Number of 0s: 432
Number of features: 238
Number of columns: 1190
Saved 66 to ./data_test
Number of 1s: 943, Number of 0s: 864
Number of fea