## Experiment description

The goal of this notebook is to preprocess the Wind Farm C datasets into a binary format (10 bits per feature value) that can be used for training and testing a TM classifier.

In [17]:
# Import all required libraries

import pandas as pd
import numpy as np

In [18]:
# Columns removed from every dataset returned by load_df_and_annotate_anomalies.
# This must be a plain list: with a trailing comma it becomes a one-element tuple
# wrapping the list, and the `col not in exclude_columns` check never matches.
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    """Load one event dataset and attach a binary ``label`` column.

    Reads ``datasets/<event_id>.csv`` and ``event_info.csv`` of the given wind
    farm (both ';'-delimited). Rows whose ``id`` lies in the inclusive range
    ``[event_start_id, event_end_id]`` are labelled 1 when the event is marked
    "anomaly" in ``event_info.csv``; every other row is labelled 0.

    Args:
        farm: Wind farm identifier used in the directory name (e.g. "C").
        event_id: Selects both the dataset file and the metadata row.

    Returns:
        The dataset including the new ``label`` column, restricted to the
        columns that pass the module-level ``exclude_columns`` filter.

    Raises:
        IndexError: If ``event_info.csv`` contains no row for ``event_id``.
    """
    dataset_path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    event_info_path = f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv"

    df = pd.read_csv(dataset_path, delimiter=';')
    event_info = pd.read_csv(event_info_path, delimiter=';')

    # Metadata row(s) describing this event
    event_row = event_info[event_info['event_id'] == event_id]
    event_label, first_id, last_id = (
        event_row[field].values[0]
        for field in ("event_label", "event_start_id", "event_end_id")
    )

    # Rows inside the event window (both bounds inclusive)
    in_event_window = df['id'].between(first_id, last_id)

    # Everything is normal (0) unless the event itself is an anomaly
    df['label'] = 0
    if event_label == "anomaly":
        df.loc[in_event_window, 'label'] = 1

    kept_columns = [col for col in df.columns if col not in exclude_columns]
    return df[kept_columns]


def split_dataset(data: pd.DataFrame, random_state=None):
    """Build a class-balanced frame from one annotated dataset.

    Anomalous samples are the rows with ``train_test == 'prediction'`` and
    ``label == 1``. Normal samples are drawn at random, without replacement,
    from the rows with ``train_test == 'train'`` and ``label == 0``.

    Args:
        data: Frame containing at least the ``train_test`` and ``label`` columns.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw of normal rows is reproducible. ``None`` (the default) keeps
            the draw non-deterministic.

    Returns:
        The sampled normal rows followed by all anomalous rows. The number of
        normal rows equals the number of anomalous rows, or the number of
        available normal rows if that is smaller.
    """
    prediction_rows = data[data['train_test'] == 'prediction']
    train_rows = data[data['train_test'] == 'train']

    df_anomalies = prediction_rows[prediction_rows['label'] == 1]
    df_normal = train_rows[train_rows['label'] == 0]

    # Take as many normal samples as there are anomalies, but never ask for
    # more rows than exist (sampling without replacement would raise).
    n = min(len(df_anomalies), len(df_normal))
    df_normal = df_normal.sample(n=n, random_state=random_state)

    # Normal rows first, anomalies second
    return pd.concat([df_normal, df_anomalies])

In [19]:
def calculate_min_max(farm, dataset_ids):
    """Collect the overall minimum and maximum of every column across datasets.

    Each dataset is loaded through ``load_df_and_annotate_anomalies`` and the
    per-column extremes are merged into a running result.

    Args:
        farm: Wind farm identifier passed on to the loader.
        dataset_ids: Iterable of event ids to scan.

    Returns:
        dict mapping column name -> ``(min, max)`` over all scanned datasets.
        Empty when ``dataset_ids`` is empty.
    """
    bounds = {}

    for dataset_id in dataset_ids:
        frame = load_df_and_annotate_anomalies(farm, dataset_id)

        for column in frame.columns:
            low, high = frame[column].min(), frame[column].max()

            if column in bounds:
                seen_low, seen_high = bounds[column]
                # The freshly computed value goes first: Python's min/max return
                # their first argument when the comparison is a tie or unordered.
                low, high = min(low, seen_low), max(high, seen_high)

            bounds[column] = (low, high)

    return bounds

In [20]:
def convert_to_10bit_integers(df, minmax):
    """Min-max scale every column to 0..1023 and expand each value into 10 bits.

    Args:
        df: Numeric frame; every column name must be a key of ``minmax``.
        minmax: Mapping column name -> ``(min, max)`` used for scaling.

    Returns:
        2-D integer array of shape ``(len(df), 10 * number_of_columns)`` holding
        0/1 values. The 10 bits of each column are stored most significant bit
        first, and column order follows ``df.columns``.

    Notes:
        Scaled values are truncated to integers and clipped to ``[0, 1023]``.
        A value outside the supplied ``(min, max)`` range therefore saturates
        instead of yielding a negative or 11+ bit number that would break the
        fixed 10-bit layout.
    """
    n_bits = 10
    max_level = (1 << n_bits) - 1  # 1023, the largest 10-bit value

    normalized_df = df.copy()

    for col in df.columns:
        min_val = minmax[col][0]
        max_val = minmax[col][1]

        # Edge case: a constant column carries no information (and would divide by zero)
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = (df[col] - min_val) / (max_val - min_val) * max_level

    # Saturate out-of-range values, then truncate to integer levels
    levels = normalized_df.clip(lower=0, upper=max_level).astype(int).to_numpy()

    # Expand each level into its bits, most significant first:
    # (rows, cols) -> (rows, cols, n_bits) -> (rows, cols * n_bits)
    shifts = np.arange(n_bits - 1, -1, -1)
    bits = (levels[:, :, np.newaxis] >> shifts) & 1

    return bits.reshape(levels.shape[0], levels.shape[1] * n_bits)

In [21]:
def binarize_dataset_for_training(farm, event_id, output_path, min_max_values):
    """Write the balanced, binarized training files for one event.

    The dataset is loaded, reduced with ``split_dataset`` and encoded with
    ``convert_to_10bit_integers``. Two text files are written to
    ``output_path``: ``X_<farm>_<event_id>_10b.txt`` (feature bits) and
    ``y_<farm>_<event_id>_10b.txt`` (labels). A short summary is printed.

    Args:
        farm: Wind farm identifier.
        event_id: Event/dataset id to process.
        output_path: Directory that receives the output files.
        min_max_values: Mapping column name -> ``(min, max)`` used for scaling.
    """
    balanced = split_dataset(load_df_and_annotate_anomalies(farm, event_id))

    # Features: everything except the label and bookkeeping columns. Values that
    # cannot be parsed as numbers become NaN, and columns containing any NaN
    # are dropped.
    features = (
        balanced
        .drop(columns=['label', 'train_test', 'status_type_id'])
        .apply(pd.to_numeric, errors='coerce')
        .dropna(axis=1)
    )

    feature_bits = convert_to_10bit_integers(features, min_max_values).astype(np.uint32)
    labels = balanced['label'].values.astype(np.uint32)

    # Persist as whitespace-separated integer text
    np.savetxt(f"{output_path}/X_{farm}_{event_id}_10b.txt", feature_bits, fmt='%d')
    np.savetxt(f"{output_path}/y_{farm}_{event_id}_10b.txt", labels, fmt='%d')

    anomaly_count = int((labels == 1).sum())
    normal_count = int((labels == 0).sum())

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {anomaly_count}, Number of 0s: {normal_count}")


def binarize_dataset_for_testing(farm, event_id, output_path, min_max_values):
    """Write the binarized evaluation files for one complete event dataset.

    Unlike ``binarize_dataset_for_training`` no rebalancing is applied; every
    row of the dataset is exported. Three text files are written to
    ``output_path``: ``X_<farm>_<event_id>_10b.txt`` (feature bits),
    ``y_<farm>_<event_id>_10b.txt`` (labels) and ``z_<farm>_<event_id>_10b.txt``
    (the ``status_type_id`` of every row). A short summary is printed.

    Args:
        farm: Wind farm identifier.
        event_id: Event/dataset id to process.
        output_path: Directory that receives the output files.
        min_max_values: Mapping column name -> ``(min, max)`` used for scaling.
    """
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)

    # Split into features, labels and status. ``status_type_id`` is exported on
    # its own (z file) and must NOT stay in the features: the training export
    # drops it, so keeping it here would make the test feature vectors 10 bits
    # wider than the training ones.
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    y_values = df['label']
    z_values = df['status_type_id']

    X_values = X_values.apply(pd.to_numeric, errors='coerce')
    # NOTE(review): this drops every column containing a NaN in *this* dataset,
    # so the feature width may differ between datasets -- confirm this is intended.
    X_values = X_values.dropna(axis=1)

    X_values_bin = convert_to_10bit_integers(X_values, min_max_values).astype(np.uint32)
    y_values_bin = y_values.values.astype(np.uint32)
    z_values_bin = z_values.values.astype(np.uint32)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}_10b.txt", X_values_bin, fmt='%d')
    np.savetxt(f"{output_path}/y_{farm}_{event_id}_10b.txt", y_values_bin, fmt='%d')
    np.savetxt(f"{output_path}/z_{farm}_{event_id}_10b.txt", z_values_bin, fmt='%d')

    num_1s = np.count_nonzero(y_values_bin == 1)
    num_0s = np.count_nonzero(y_values_bin == 0)

    print(f"Saved {event_id} to {output_path}")
    print(f"Number of 1s: {num_1s}, Number of 0s: {num_0s}")

In [22]:
# Event ids exported with binarize_dataset_for_training
train_datasets = [
    49, 31, 67, 9, 91, 5,
    90, 70, 35, 16, 76,
]

# Event ids exported with binarize_dataset_for_testing
test_datasets = [
    # Has anomalies
    55, 81, 47, 12, 4, 18, 28, 39,
    66, 15, 78, 79, 30, 33, 11, 44,
]

# Not part of test_datasets at the moment -- events without anomalies:
#     8, 85, 6, 62, 36, 56, 94, 54, 43, 50, 64, 46, 65, 61, 93, 75, 41, 58, 48, 88, 57, 32, 89, 59, 63, 80, 37, 29, 1, 20, 60

In [23]:
# Column-wise (min, max) over every train and test dataset of Wind Farm C
min_max_values_dict = calculate_min_max("C", train_datasets + test_datasets)

# Keep a plain-text copy, one "column: (min, max)" line per column
with open("min_max_values.txt", "w") as f:
    f.writelines(f"{key}: {value}\n" for key, value in min_max_values_dict.items())

In [24]:
# Export every test dataset in full (X, y and z files) to ./test_data
for dataset in test_datasets:
    binarize_dataset_for_testing(
        farm="C",
        event_id=dataset,
        output_path="./test_data",
        min_max_values=min_max_values_dict,
    )

Saved 55 to ./test_data
Number of 1s: 2473, Number of 0s: 53280
Saved 81 to ./test_data
Number of 1s: 364, Number of 0s: 53568
Saved 47 to ./test_data
Number of 1s: 713, Number of 0s: 53280
Saved 12 to ./test_data
Number of 1s: 3259, Number of 0s: 52848


KeyboardInterrupt: 

In [28]:
# Export the balanced training files (X and y) for every training dataset to ./train_data
for dataset in train_datasets:
    binarize_dataset_for_training(
        farm="C",
        event_id=dataset,
        output_path="./train_data",
        min_max_values=min_max_values_dict,
    )

Saved 49 to ./data
Number of 1s: 598, Number of 0s: 598
Saved 31 to ./data
Number of 1s: 1008, Number of 0s: 1008
Saved 67 to ./data
Number of 1s: 576, Number of 0s: 576
Saved 9 to ./data
Number of 1s: 432, Number of 0s: 432
Saved 91 to ./data
Number of 1s: 1152, Number of 0s: 1152
Saved 5 to ./data
Number of 1s: 144, Number of 0s: 144
Saved 90 to ./data
Number of 1s: 576, Number of 0s: 576
Saved 70 to ./data
Number of 1s: 576, Number of 0s: 576
Saved 35 to ./data
Number of 1s: 288, Number of 0s: 288
Saved 16 to ./data
Number of 1s: 144, Number of 0s: 144
Saved 76 to ./data
Number of 1s: 246, Number of 0s: 246
