In [1]:
# Import all required libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Seed NumPy's legacy global RNG for reproducibility.
# binarize_dataset_with_hdc draws all of its random hypervectors from this
# global state (np.random.randint), so what a given call produces depends on
# how many draws have happened since this cell was last executed.
np.random.seed(42)

In [3]:
import numpy as np


def binarize_dataset_with_hdc(dataset, D=2000, Q=8, n_gram=3, show_details=False):
    """
    Binarize a multi-feature dataset using Hyperdimensional Computing (HDC).

    Encoding pipeline, per sample:
      1. Every feature whose column is not constant is quantized into one of
         Q equal-width buckets between that column's min and max.
      2. The bucket's random "interval" hypervector is bound (XOR) with the
         feature's random ID hypervector.
      3. If more than ``n_gram`` features are encoded, every window of
         ``n_gram`` adjacent encoded features is bound with per-position keys
         and bundled (majority vote); the windows are then bundled into the
         sample hypervector. Otherwise the encoded features are bundled
         directly.

    Constant columns carry no information and are left out *before* the
    windows are formed, so windows always contain exactly ``n_gram`` features.
    A dataset in which every column is constant yields all-zero hypervectors.

    Notes:
        * Random hypervectors are drawn from NumPy's global RNG
          (``np.random.randint``), in this order: feature IDs, interval
          vectors (feature-major), position vectors, gram vectors. Call
          ``np.random.seed`` right before this function if two datasets must
          share the same item memory.
        * Majority-vote ties resolve to 0.

    Parameters:
        dataset (np.ndarray): Dataset with shape [num_samples, num_features]
        D (int): Dimension of hypervectors
        Q (int): Number of quantization levels (must be >= 1)
        n_gram (int): Size of N-gram window for sequence encoding; values
            below 1 disable windowing and bundle the features directly
        show_details (bool): Whether to print detailed output

    Returns:
        np.ndarray: uint8 array of shape [num_samples, D] holding one binary
        hypervector per sample

    Raises:
        ValueError: If ``dataset`` is not 2-D, ``Q`` is smaller than 1, or a
            non-constant column contains NaN / infinite values.
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 2:
        raise ValueError(
            f"dataset must have shape [num_samples, num_features], got {dataset.shape}")
    if Q < 1:
        raise ValueError(f"Q must be at least 1, got {Q}")

    num_samples, num_features = dataset.shape

    if show_details:
        print(f"Dataset shape: {dataset.shape}")
        print(f"Number of samples: {num_samples}")
        print(f"Number of features: {num_features}")

    if num_samples == 0:
        # Nothing to encode (and min/max of an empty column are undefined)
        return np.zeros((0, D), dtype=np.uint8)

    def generate_random_hypervectors(count):
        """Draw `count` random binary hypervectors, one RNG call per vector."""
        # One call per vector keeps the random stream identical to drawing
        # the vectors individually, whatever the value of D.
        hvs = np.empty((count, D), dtype=np.uint8)
        for row in range(count):
            hvs[row] = np.random.randint(0, 2, D, dtype=np.uint8)
        return hvs

    def bundle(hvs):
        """Bundling operation (majority vote over the rows, ties -> 0)"""
        return (np.sum(hvs, axis=0, dtype=np.int64) > len(hvs) / 2).astype(np.uint8)

    # Global min and max for each feature
    min_values = np.min(dataset, axis=0)
    max_values = np.max(dataset, axis=0)

    # Item memory (draw order matters for reproducibility, see docstring)
    window_size = max(n_gram, 0)
    feature_id_vectors = generate_random_hypervectors(num_features)
    interval_vectors = generate_random_hypervectors(num_features * Q).reshape(num_features, Q, D)
    position_vectors = generate_random_hypervectors(window_size)
    gram_vectors = generate_random_hypervectors(window_size)

    # Features with no variation cannot be quantized and are skipped
    active_features = np.flatnonzero(min_values != max_values)
    num_active = len(active_features)

    if num_active:
        active_values = dataset[:, active_features]
        if not np.all(np.isfinite(active_values)):
            raise ValueError("dataset contains NaN or infinite values; clean it before encoding")

        # Quantize the whole matrix at once: truncate (value - min) / step
        # and clamp into [0, Q - 1] so that value == max lands in the last bucket
        low = min_values[active_features]
        step = (max_values[active_features] - low) / Q
        bucket_indices = np.clip((active_values - low) / step, 0, Q - 1).astype(np.intp)

        # bound_items[f, q] = interval vector q of feature f XOR ID of feature f
        bound_items = interval_vectors[active_features] ^ feature_id_vectors[active_features][:, None, :]
        active_rows = np.arange(num_active)

    # Binding with the position vector and then the gram vector is a single
    # XOR with their combination
    window_keys = position_vectors ^ gram_vectors
    use_ngrams = window_size >= 1 and num_active > window_size
    num_windows = num_active - window_size + 1

    binarized_dataset = np.zeros((num_samples, D), dtype=np.uint8)

    for sample_idx in range(num_samples):
        if show_details and sample_idx == 0:
            print(f"\nProcessing sample 0: {dataset[sample_idx]}")
            # Only show the first 3 features for the first sample
            for row, feature_idx in enumerate(active_features[:3]):
                if feature_idx < 3:
                    print(f"  Feature {feature_idx} value: {dataset[0, feature_idx]:.4f}, "
                          f"bucket: {int(bucket_indices[0, row])}")

        if num_active:
            # One bound hypervector per encoded feature, shape (num_active, D)
            feature_hvs = bound_items[active_rows, bucket_indices[sample_idx]]

            if use_ngrams:
                # counts[i] = number of ones, per bit, among the n_gram
                # key-bound features of window i
                counts = np.zeros((num_windows, D), dtype=np.int32)
                for offset in range(window_size):
                    counts += feature_hvs[offset:offset + num_windows] ^ window_keys[offset]
                ngram_hvs = counts > window_size / 2
                binarized_dataset[sample_idx] = bundle(ngram_hvs)
            else:
                # Too few features for a full window: just bundle them
                binarized_dataset[sample_idx] = bundle(feature_hvs)

        if show_details and sample_idx == 0:
            # Count ones in the first sample's binary vector
            sample_hv = binarized_dataset[sample_idx]
            ones_count = np.sum(sample_hv)
            print(
                f"  Sample 0 binary vector: {len(sample_hv)} bits, {ones_count} ones ({ones_count / len(sample_hv):.4f})")

    if show_details:
        print(f"\nBinarized dataset shape: {binarized_dataset.shape}")

    return binarized_dataset

In [4]:
# Wind farm identifier; it is interpolated into the data folder name
# ("Wind Farm {farm}") when a dataset is loaded.
wind_farm = "B"
# Event ids whose 'train' split is encoded into ./data_train
train_datasets = [34, 7]
# Event ids whose 'prediction' split is encoded into ./data_test
test_datasets = [34, 7, 53, 27, 19, 77, 83, 52, 21, 2, 23, 87, 74, 86, 82]

# Reference copy of the full event-id list (currently identical to test_datasets):
# [34, 7, 53, 27, 19, 77, 83, 52, 21, 2, 23, 87, 74, 86, 82]

In [5]:
# Make sure both output folders are present before anything is written to them
for folder_name in ("data_test", "data_train"):
    os.makedirs(folder_name, exist_ok=True)

In [6]:
# Columns that are dropped from every loaded dataset before it is returned
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_df_and_annotate_anomalies(farm, event_id):
    """
    Load one event dataset of a wind farm and add a binary 'label' column.

    The dataset is read from
    ``../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv``
    and the event metadata from ``event_info.csv`` in the same farm folder
    (both ';'-separated). Rows whose 'id' lies in
    [event_start_id, event_end_id] (inclusive) get label 1 when the event's
    'event_label' is "anomaly"; every other row gets label 0.

    Parameters:
        farm (str): Wind farm identifier used in the folder name
        event_id (int): Event id; selects both the CSV file and the metadata row

    Returns:
        pd.DataFrame: The dataset without the columns listed in
        ``exclude_columns``, with the added 'label' column, and without rows
        that contain NaN or +/-inf. The original row index is kept.

    Raises:
        ValueError: If ``event_info.csv`` has no row for ``event_id``.
    """
    farm_dir = f"../../../data/care_to_compare/Wind Farm {farm}"
    df = pd.read_csv(f"{farm_dir}/datasets/{event_id}.csv", delimiter=';')
    event_info = pd.read_csv(f"{farm_dir}/event_info.csv", delimiter=';')

    # Metadata row describing this event
    metadata = event_info[event_info['event_id'] == event_id]
    if metadata.empty:
        raise ValueError(f"event_id {event_id} not found in event_info.csv of Wind Farm {farm}")

    event_label = metadata["event_label"].values[0]
    event_start_id = metadata["event_start_id"].values[0]
    event_end_id = metadata["event_end_id"].values[0]

    label_value = 1 if event_label == "anomaly" else 0

    # Label the rows inside the event window; everything else stays 0
    in_event_window = (df['id'] >= event_start_id) & (df['id'] <= event_end_id)
    df['label'] = 0
    df.loc[in_event_window, 'label'] = label_value

    # The window is resolved via 'id', so the excluded columns can only be
    # dropped after labelling
    kept_columns = [col for col in df.columns if col not in exclude_columns]

    # Treat +/-inf as missing and drop incomplete rows; done without inplace
    # so no column-subset frame is mutated
    return df[kept_columns].replace([np.inf, -np.inf], np.nan).dropna()

In [7]:
def binarize_dataset_for_training(farm, event_id, output_path, seed=42):
    """
    Encode the training split of one event dataset and write it to disk.

    Keeps the rows with ``train_test == 'train'`` and ``status_type_id`` in
    {0, 2}, encodes the remaining feature columns with
    ``binarize_dataset_with_hdc`` and writes:

      * ``{output_path}/X_{farm}_{event_id}.txt`` - one binary hypervector per row
      * ``{output_path}/y_{farm}_{event_id}.csv`` - the matching 'label',
        'status_type_id' and 'train_test' columns

    Parameters:
        farm (str): Wind farm identifier
        event_id (int): Event id of the dataset to encode
        output_path (str): Existing folder that receives the two files
        seed (int | None): Value passed to ``np.random.seed`` right before
            encoding. The encoder draws its random hypervectors from NumPy's
            global RNG, so without reseeding every dataset would be encoded
            with a different item memory and the resulting vectors would not
            be comparable across datasets. Pass ``None`` to leave the global
            RNG state untouched.
    """
    # Load original dataset from file and keep the rows used for training
    df = load_df_and_annotate_anomalies(farm, event_id)
    df = df[df['train_test'] == 'train']
    df = df[df['status_type_id'].isin([0, 2])]

    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    X_values = X_values.apply(pd.to_numeric, errors='coerce')

    # Same seed -> same item memory for every dataset with the same columns
    if seed is not None:
        np.random.seed(seed)

    # Encode the sequence
    final_binary_vector = binarize_dataset_with_hdc(X_values.values, show_details=True)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", final_binary_vector, fmt='%d')

    label_df = pd.DataFrame({
        'label': df['label'].values,
        'status_type_id': df['status_type_id'].values,
        'train_test': df['train_test'].values
    })

    label_df.to_csv(f"{output_path}/y_{farm}_{event_id}.csv", index=False)

    print(f"Done with {event_id}")

    # Calculate statistics over every bit of the encoded matrix
    # (.size, not len(): len() would only give the number of rows)
    total_bits = final_binary_vector.size
    number_of_1s = np.sum(final_binary_vector == 1)
    number_of_0s = np.sum(final_binary_vector == 0)

    print("\nStatistics:")
    print(f"Total bits: {total_bits}")
    print(f"Number of 1s: {number_of_1s}")
    print(f"Number of 0s: {number_of_0s}")


def binarize_dataset_for_testing(farm, event_id, output_path, seed=42):
    """
    Encode the prediction split of one event dataset and write it to disk.

    Keeps the rows with ``train_test == 'prediction'``, encodes the feature
    columns with ``binarize_dataset_with_hdc`` and writes:

      * ``{output_path}/X_{farm}_{event_id}.txt`` - one binary hypervector per row
      * ``{output_path}/y_{farm}_{event_id}.csv`` - the matching 'label',
        'status_type_id' and 'train_test' columns

    Parameters:
        farm (str): Wind farm identifier
        event_id (int): Event id of the dataset to encode
        output_path (str): Existing folder that receives the two files
        seed (int | None): Value passed to ``np.random.seed`` right before
            encoding. The encoder draws its random hypervectors from NumPy's
            global RNG, so without reseeding every dataset would be encoded
            with a different item memory and the resulting vectors would not
            be comparable across datasets. Pass ``None`` to leave the global
            RNG state untouched.
    """
    # Load original dataset from file
    df = load_df_and_annotate_anomalies(farm, event_id)

    # Only take the data that is in the prediction set
    df = df[df['train_test'] == 'prediction']

    # Split into data and labels
    X_values = df.drop(columns=['label', 'train_test', 'status_type_id'])
    X_values = X_values.apply(pd.to_numeric, errors='coerce')

    # Same seed -> same item memory for every dataset with the same columns
    if seed is not None:
        np.random.seed(seed)

    # Encode the sequence
    final_binary_vector = binarize_dataset_with_hdc(X_values.values)

    # Output to file using np
    np.savetxt(f"{output_path}/X_{farm}_{event_id}.txt", final_binary_vector, fmt='%d')

    label_df = pd.DataFrame({
        'label': df['label'].values,
        'status_type_id': df['status_type_id'].values,
        'train_test': df['train_test'].values
    })

    label_df.to_csv(f"{output_path}/y_{farm}_{event_id}.csv", index=False)

    # Calculate statistics; .shape also works when the split has no rows,
    # where indexing the first row would raise IndexError
    total_rows, total_columns = final_binary_vector.shape

    print("\nStatistics:")
    print(f"Total rows: {total_rows}")
    print(f"Total columns: {total_columns}")


In [8]:
# Encode the 'train' split of every event in train_datasets and write the
# X_/y_ files into ./data_train.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the files in ./data_train may not stem from a complete run - confirm.
for dataset in train_datasets:
    binarize_dataset_for_training(wind_farm, dataset, "./data_train")

Dataset shape: (43971, 252)
Number of samples: 43971
Number of features: 252

Processing sample 0: [ 9.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  2.89000000e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.82400000e+01  6.56300000e+01  1.45900000e+01  3.39650000e+02
 -2.97530000e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  6.09000000e+00  9.64000000e+00  1.39000000e+00  1.91000000e+00
  1.23900000e+01  1.24100000e+01  0.00000000e+00  1.23800000e+01
  1.74700000e+01  1.75000000e+01  4.00000000e-02  1.74000000e+01
  2.20260000e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  2.00000000e-02  3.00000000e-02  1.00000000e-02  0.00000000e+00
 -9.55000000e-03 -7.76935484e-03  8.08064516e-04 -1.17693548e-02
  1.89285000e+04  2.27147000e+04  1.29986000e+03  1.57026000e+04
  1.92353000e+04  1.92454000e+04  3.09000000e+00  1.9229

KeyboardInterrupt: 

In [130]:
# Encode the 'prediction' split of every event in test_datasets and write the
# X_/y_ files into ./data_test; each call prints the shape of its encoded matrix.
for dataset in test_datasets:
    binarize_dataset_for_testing(wind_farm, dataset, "./data_test")


Statistics:
Total rows: 4033
Total columns: 2000

Statistics:
Total rows: 5328
Total columns: 2000

Statistics:
Total rows: 6048
Total columns: 2000

Statistics:
Total rows: 9937
Total columns: 2000

Statistics:
Total rows: 3745
Total columns: 2000

Statistics:
Total rows: 9217
Total columns: 2000

Statistics:
Total rows: 14113
Total columns: 2000

Statistics:
Total rows: 2737
Total columns: 2000

Statistics:
Total rows: 1297
Total columns: 2000

Statistics:
Total rows: 2215
Total columns: 2000

Statistics:
Total rows: 1983
Total columns: 2000

Statistics:
Total rows: 3025
Total columns: 2000

Statistics:
Total rows: 3073
Total columns: 2000

Statistics:
Total rows: 2927
Total columns: 2000

Statistics:
Total rows: 2449
Total columns: 2000
