## Experiment description

The goal of this notebook is to preprocess the selected datasets from Wind Farm B (the farm used in the cells below) into a binary format that can be used for training a TM Classifier.

In [19]:
# Import all required libraries

import os

import numpy as np
import pandas as pd

In [20]:
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    df = pd.read_csv(path, delimiter=';')

    event_info = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv", delimiter=';')

    # Find the row where event_id = event_id
    metadata = event_info[event_info['event_id'] == event_id]

    event_label = metadata["event_label"].values[0]
    event_start_id = metadata["event_start_id"].values[0]
    event_end_id = metadata["event_end_id"].values[0]

    label_value = 1 if event_label == "anomaly" else 0

    # All rows where the column "id" is between event_start_id and event_end_id
    df['label'] = 0
    df.loc[(df['id'] >= event_start_id) & (df['id'] <= event_end_id), 'label'] = label_value

    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]

    # Remove columns with suffixes in exclude_columns_with_suffix
    df = df[[col for col in df.columns if not col.endswith('_max')]]
    df = df[[col for col in df.columns if not col.endswith('_min')]]
    df = df[[col for col in df.columns if not col.endswith('_std')]]

    return df

In [21]:
def _merge_extreme(reducer, new_value, current_value):
    """Combine two per-dataset extremes with ``reducer`` while ignoring NaN."""
    # Built-in min()/max() return whichever operand comes first when the other
    # is NaN, so a dataset whose column is entirely NaN would overwrite a
    # valid range collected from an earlier dataset.
    if pd.isna(current_value):
        return new_value
    if pd.isna(new_value):
        return current_value
    return reducer(new_value, current_value)


def calculate_min_max(farm, dataset_ids):
    """Collect the global (min, max) of every column across several datasets.

    Each dataset is loaded with ``load_df_and_annotate_anomalies`` and every
    column of the returned frame contributes, so the result also holds entries
    for columns such as ``label``.

    Args:
        farm: Wind farm identifier forwarded to the loader.
        dataset_ids: Iterable of event/dataset ids to scan.

    Returns:
        dict mapping column name -> ``(min_value, max_value)`` over all
        datasets. A bound is NaN only if the column has no valid value in any
        of the datasets.
    """
    min_max_values = {}

    for dataset_id in dataset_ids:
        df = load_df_and_annotate_anomalies(farm, dataset_id)

        for col in df.columns:
            min_val = df[col].min()
            max_val = df[col].max()

            if col not in min_max_values:
                min_max_values[col] = (min_val, max_val)
            else:
                current_min, current_max = min_max_values[col]
                min_max_values[col] = (
                    _merge_extreme(min, min_val, current_min),
                    _merge_extreme(max, max_val, current_max),
                )

    return min_max_values

In [22]:
def convert_to_10bit_integers(df, minmax):
    """Min-max scale every column to 0..255 and expand each value into bits.

    Despite the (historical) name, each value is encoded with 8 bits: the
    scaled value is truncated to an integer in 0..255 and written out
    most-significant bit first. The bits of all columns are concatenated in
    column order.

    Args:
        df: DataFrame of numeric feature columns.
        minmax: Mapping column name -> ``(min, max)`` used for scaling. Every
            column of ``df`` must have an entry.

    Returns:
        numpy.ndarray of shape ``(len(df), 8 * number_of_columns)`` holding
        integer 0/1 values.

    Notes:
        Values outside the supplied ``[min, max]`` range are saturated to
        0 / 255. Without this, a negative scaled value would format with a
        '-' sign and a value above 255 would need more than 8 bits, which
        either crashes or produces rows of unequal length.
    """
    max_level = 255  # largest value representable in 8 bits
    scaled = df.copy()

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # Constant column: there is no range to scale over, encode it as 0
        if min_val == max_val:
            scaled[col] = 0
        else:
            scaled[col] = (df[col] - min_val) / (max_val - min_val) * max_level

    # Saturate, then truncate towards zero (same rounding as a plain int cast)
    levels = scaled.clip(lower=0, upper=max_level).astype(int).to_numpy()

    # Expand every 8-bit level into its bits, MSB first, column after column
    bits = np.unpackbits(levels.astype(np.uint8), axis=1)

    return bits.astype(int)

In [23]:
def _binarize_and_save(df, farm, event_id, output_path, min_max_values, save_status=False):
    """Binarize the feature columns of ``df`` and write them as text files.

    Writes ``X_{farm}_{event_id}.txt`` (bit features) and
    ``y_{farm}_{event_id}.txt`` (labels) into ``output_path``; when
    ``save_status`` is true, ``z_{farm}_{event_id}.txt`` (``status_type_id``)
    is written as well.
    """
    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    y_values = df['label']

    # Non-numeric cells become NaN, then every column containing a NaN is dropped
    X_values = X_values.apply(pd.to_numeric, errors='coerce')
    X_values = X_values.dropna(axis=1)

    # Print number of columns
    print(len(X_values.columns))

    X_values_bin = convert_to_10bit_integers(X_values, min_max_values).astype(np.uint32)
    y_values_bin = y_values.values.astype(np.uint32)
    z_values_bin = df['status_type_id'].values.astype(np.uint32) if save_status else None

    # np.savetxt does not create missing directories
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", X_values_bin, fmt='%d')
    np.savetxt(f"{output_path}/y_{farm}_{event_id}.txt", y_values_bin, fmt='%d')
    if save_status:
        np.savetxt(f"{output_path}/z_{farm}_{event_id}.txt", z_values_bin, fmt='%d')

    num_1s = np.count_nonzero(y_values_bin == 1)
    num_0s = np.count_nonzero(y_values_bin == 0)

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")


def binarize_dataset_for_training(farm, event_id, output_path, min_max_values):
    """Binarize the training part of one event dataset and save it.

    Keeps only rows with ``train_test == 'train'`` whose ``status_type_id`` is
    0 or 2, then writes the X and y files into ``output_path`` (created if it
    does not exist).

    Args:
        farm: Wind farm identifier.
        event_id: Event/dataset identifier.
        output_path: Directory receiving the output files.
        min_max_values: Mapping column name -> (min, max) used for scaling.
    """
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)
    df = df[df['train_test'] == 'train']

    # Remove all rows where status_type_id is not 0 or 2
    df = df[df['status_type_id'].isin([0, 2])]

    _binarize_and_save(df, farm, event_id, output_path, min_max_values)


def binarize_dataset_for_testing(farm, event_id, output_path, min_max_values):
    """Binarize the prediction part of one event dataset and save it.

    Keeps only rows with ``train_test == 'prediction'`` (no status filtering)
    and writes the X, y and z (``status_type_id``) files into ``output_path``
    (created if it does not exist).

    Args:
        farm: Wind farm identifier.
        event_id: Event/dataset identifier.
        output_path: Directory receiving the output files.
        min_max_values: Mapping column name -> (min, max) used for scaling.
    """
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)
    df = df[df['train_test'] == 'prediction']

    _binarize_and_save(df, farm, event_id, output_path, min_max_values, save_status=True)

In [24]:
# Event ids whose rows marked 'train' are written out as training data.
train_datasets = [34]
# Event ids whose rows marked 'prediction' are written out as test data.
test_datasets = [34]

In [25]:
# Global per-column ranges over every dataset used for training or testing.
# An event that appears in both lists is only loaded and scanned once;
# dict.fromkeys removes duplicates while keeping the original order.
all_dataset_ids = list(dict.fromkeys(train_datasets + test_datasets))
min_max_values_dict = calculate_min_max("B", all_dataset_ids)

# Save to file, one "column: (min, max)" line per column
with open("min_max_values.txt", "w") as f:
    for key, value in min_max_values_dict.items():
        f.write(f"{key}: {value}\n")

In [26]:
# Write the binarized 'prediction' rows (X, y and status z files) of every
# test event into ./data_test, scaled with the shared min/max ranges.
for dataset in test_datasets:
    binarize_dataset_for_testing("B", dataset, "./data_test", min_max_values_dict)

63
Saved 34 to ./data_test
Number of 1s: 3169, Number of 0s: 864


In [27]:
# Write the binarized 'train' rows (status_type_id 0 or 2 only; X and y files)
# of every training event into ./data_train, scaled with the shared ranges.
for dataset in train_datasets:
    binarize_dataset_for_training("B", dataset, "./data_train", min_max_values_dict)

63
Saved 34 to ./data_train
Number of 1s: 0, Number of 0s: 43971
