In [541]:
import matplotlib.pyplot as plt

# List the names of the style sheets known to this matplotlib installation
# (used to pick the style applied in the imports cell).
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 '_mpl-gallery',
 '_mpl-gallery-nogrid',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-v0_8',
 'seaborn-v0_8-bright',
 'seaborn-v0_8-colorblind',
 'seaborn-v0_8-dark',
 'seaborn-v0_8-dark-palette',
 'seaborn-v0_8-darkgrid',
 'seaborn-v0_8-deep',
 'seaborn-v0_8-muted',
 'seaborn-v0_8-notebook',
 'seaborn-v0_8-paper',
 'seaborn-v0_8-pastel',
 'seaborn-v0_8-poster',
 'seaborn-v0_8-talk',
 'seaborn-v0_8-ticks',
 'seaborn-v0_8-white',
 'seaborn-v0_8-whitegrid',
 'tableau-colorblind10']

In [542]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotx
import pandas as pd
import os

import pickle

from tmu.models.autoencoder.autoencoder import TMAutoEncoder
from src.lib.care import calculate_care_score

# Apply the "seaborn-v0_8" style sheet globally so every figure created in this
# notebook shares the same look.
plt.style.use("seaborn-v0_8")

In [543]:
# Create the output folders if they do not exist yet:
#   figures/     - plots written by plot_mse / plot_predictions
#   predictions/ - pickled model reconstructions written by save_predictions
os.makedirs("figures", exist_ok=True)
os.makedirs("predictions", exist_ok=True)

In [544]:
def load_test_dataset(farm, event_id):
    """
    Loads the feature matrix of one test event from a whitespace-separated text file.

    Parameters:
    - farm: Wind farm identifier used in the file name (e.g. "C").
    - event_id: Event/dataset identifier used in the file name.

    Returns:
    - numpy array of dtype uint32 read from ./data_test/X_{farm}_{event_id}.txt.
    """
    # np.loadtxt already returns a fresh uint32 ndarray, so no further
    # np.array(...).astype(...) copies are needed.
    return np.loadtxt(f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32)


def load_thresh_dataset(farm, event_id, n_rows=10000):
    """
    Loads the leading rows of a test event, intended for threshold calibration.

    Parameters:
    - farm: Wind farm identifier used in the file name (e.g. "C").
    - event_id: Event/dataset identifier used in the file name.
    - n_rows (int or None): Number of leading rows to keep. Defaults to 10000,
      the value that was previously hard-coded; None keeps every row.

    Returns:
    - numpy array of dtype uint32 holding at most n_rows rows of
      ./data_test/X_{farm}_{event_id}.txt.
    """
    # np.loadtxt already yields a uint32 ndarray; no extra conversion required.
    X = np.loadtxt(f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32)

    # Keep only the first n_rows rows (fewer if the file is shorter).
    return X[:n_rows]


def load_test_labels(farm, event_id):
    """
    Reads the per-row annotations of one test event from its CSV file.

    Parameters:
    - farm: Wind farm identifier used in the file name.
    - event_id: Event/dataset identifier used in the file name.

    Returns:
    - tuple (labels, status_ids, train_test): the 'label' and 'status_type_id'
      columns as uint32 numpy arrays, and the 'train_test' column values unchanged.
    """
    frame = pd.read_csv(f"./data_test/y_{farm}_{event_id}.csv")

    # Pull the three columns out first, in the same order as they are returned.
    columns = {
        column: frame[column].values
        for column in ('label', 'status_type_id', 'train_test')
    }

    def as_uint32(values):
        return np.array(values).astype(np.uint32)

    return as_uint32(columns['label']), as_uint32(columns['status_type_id']), columns['train_test']


def load_test_label(farm, event_id):
    """
    Looks up the event-level label of an event in the farm's event_info.csv.

    Parameters:
    - farm: Wind farm identifier; selects the "Wind Farm {farm}" folder.
    - event_id: Value matched against the 'event_id' column.

    Returns:
    - bool: False when the event's 'event_label' equals "anomaly", True otherwise
      (i.e. True means the event is considered normal).
    """
    info_path = f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv"
    events = pd.read_csv(info_path, delimiter=';')

    # First matching row wins; an unknown event_id raises IndexError here.
    matching_rows = events[events['event_id'] == event_id]
    label = matching_rows["event_label"].values[0]

    return not (label == "anomaly")

In [545]:
def calculate_accuracy(labels, predictions):
    """
    Fraction of positions at which predictions agree with labels.

    Parameters:
    - labels: Ground-truth values.
    - predictions: Predicted values, compared element-wise against labels.

    Returns:
    - Number of matching positions divided by len(labels).
    """
    matches = labels == predictions
    n_correct = np.sum(matches)
    return n_correct / len(labels)


def calculate_threshold(X, pred, percentile=90):
    """
    Derives an anomaly threshold from the per-row reconstruction losses.

    Parameters:
    - X: Input rows.
    - pred: Reconstructed rows, aligned with X.
    - percentile (float): Percentile of the per-row Hamming losses to use as the
      threshold. Defaults to 90 (the value previously hard-coded), meaning roughly
      the 10% of rows with the highest loss lie above the threshold.

    Returns:
    - float: The requested percentile of the per-row Hamming losses.
    """
    losses = [hamming_loss(X[i], pred[i]) for i in range(len(X))]

    # Threshold = loss value below which `percentile` percent of the rows fall.
    threshold = np.percentile(losses, percentile)

    return threshold


In [546]:
def load_model(filename) -> TMAutoEncoder:
    """
    Restores a pickled model from disk.

    Parameters:
    - filename: Path of the pickle file to read.

    Returns:
    - The unpickled object (annotated as a TMAutoEncoder).
    """
    # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
    with open(filename, "rb") as handle:
        return pickle.load(handle)


def save_predictions(predictions, filename):
    """
    Pickles predictions to filename, overwriting any existing file.

    Parameters:
    - predictions: Object to serialize.
    - filename: Destination path.
    """
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(predictions)


def load_predictions(filename):
    """
    Reads back an object previously pickled to filename.

    Parameters:
    - filename: Path of the pickle file to read.

    Returns:
    - The unpickled object.
    """
    with open(filename, "rb") as source:
        return pickle.Unpickler(source).load()



In [547]:
def hamming_loss(pred, X_test):
    """
    Fraction of positions at which two equally shaped arrays differ.

    Parameters:
    - pred (numpy array): Predicted values.
    - X_test (numpy array): Reference values; must have the same shape as pred.

    Returns:
    - Number of differing elements divided by the total number of elements.

    Raises:
    - AssertionError: If the two shapes differ.
    """
    assert pred.shape == X_test.shape, "Shapes of pred and X_test must match"

    # Element-wise mismatch mask, then count / total size.
    mismatch_mask = pred != X_test
    n_wrong = np.sum(mismatch_mask)
    n_total = np.prod(X_test.shape)

    return n_wrong / n_total

In [548]:
def reconstruction_accuracy(X, pred):
    """
    Fraction of elements that the reconstruction reproduces exactly.

    Parameters:
    - X: Original values (any shape).
    - pred: Reconstructed values, same shape as X.

    Returns:
    - Number of equal elements divided by the total number of elements in X,
      so the result always lies in [0, 1].
    """
    X = np.asarray(X)
    pred = np.asarray(pred)

    correct = np.sum(X == pred)

    # Normalize by the element count, not the row count: dividing by len(X)
    # gave values above 1 for 2-D inputs (matching bits per row, not a fraction).
    accuracy = correct / X.size
    return accuracy


def plot_mse(X, y, pred, name, threshold):
    """
    Saves a histogram of the per-row reconstruction losses with the threshold marked.

    Despite the name, the loss plotted is the Hamming loss between each input row
    and its reconstruction.

    Parameters:
    - X: Input rows.
    - y: Unused; kept so the signature stays compatible with existing callers.
    - pred: Reconstructed rows, aligned with X.
    - name: Suffix of the output file name.
    - threshold: Loss value drawn as a vertical dashed line.

    Side effects:
    - Writes ./figures/plot_reconstruction_acc_{name}.png.
    """
    # Hamming loss of every row against its reconstruction
    row_losses = [hamming_loss(X[i], pred[i]) for i in range(len(X))]

    # Explicit figure/axes instead of the pyplot state machine, so exactly this
    # figure is drawn on, saved and closed.
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        sns.histplot(row_losses, bins=50, kde=True, color='b', ax=ax)

        # Mark the anomaly threshold
        ax.axvline(threshold, color='r', linestyle='--', label="Threshold")

        ax.set_xlabel("Reconstruction Loss (Hamming Loss)")
        ax.legend()

        fig.savefig(f"./figures/plot_reconstruction_acc_{name}.png")
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


def plot_predictions(X, y, z, pred, p, name, plot_name, threshold):
    """
    Saves a scatter plot of the per-row reconstruction loss, coloured by prediction.

    Parameters:
    - X: Input rows.
    - y: Per-row ground-truth labels; rows equal to 1 define the shaded anomaly region.
    - z: Unused; kept so the signature stays compatible with existing callers.
    - pred: Reconstructed rows, aligned with X.
    - p: Per-row predicted labels (0 = normal, 1 = anomaly), aligned with X.
    - name: Suffix of the output file name.
    - plot_name: Title of the plot.
    - threshold: Loss value drawn as a horizontal dashed line.

    Side effects:
    - Writes ./figures/plot_detections_{name}.pdf.
    """
    x = np.arange(0, len(X))  # Row index, used as the time axis
    r = np.array([hamming_loss(X[i], pred[i]) for i in range(len(X))])

    # Indices of the rows labelled as anomalous (may be empty)
    anomaly_rows = np.where(np.asarray(y) == 1)[0]

    # Boolean masks splitting the rows by predicted class
    p = np.asarray(p)
    predicted_normal = p == 0
    predicted_anomaly = p == 1

    # Single axes holding the whole plot
    fig, axes = plt.subplots(figsize=(12, 10))
    try:
        # Shade the labelled anomaly region only when one exists; otherwise the
        # legend would advertise a region that is not there.
        if anomaly_rows.size > 0:
            axes.axvspan(anomaly_rows[0], anomaly_rows[-1], alpha=0.2, color='#d08770',
                         label="Region with anomalies")

        axes.scatter(x[predicted_normal], r[predicted_normal], color='#81a1c1',
                     label="Predicted normal", alpha=0.7)
        axes.scatter(x[predicted_anomaly], r[predicted_anomaly], color='#d08770',
                     label="Predicted anomaly", alpha=0.7)
        axes.axhline(threshold, color='#5e81ac', linestyle='--', label="Threshold")

        axes.set_title(plot_name)
        axes.set_xlabel("Time\nSampled at ten minute intervals")
        axes.set_ylabel("Reconstruction Loss\n(Hamming Loss)")
        axes.legend()

        # Adjust layout
        fig.tight_layout()
        axes.grid(True)

        # Save the plot
        fig.savefig(f"./figures/plot_detections_{name}.pdf")
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


def get_predictions(X, y, z, name, plot_name, tm, threshold, load_pred=False):
    """
    Reconstructs X, flags rows whose loss exceeds the threshold, and plots the result.

    Parameters:
    - X: Input rows.
    - y: Per-row ground-truth labels used for the accuracy and the plots.
    - z: Passed through to plot_predictions.
    - name: Identifier used in the cache file and figure file names.
    - plot_name: Title of the detection plot.
    - tm: Model exposing predict(X); only used when load_pred is False.
    - threshold: Rows with a Hamming loss strictly greater than this are anomalies.
    - load_pred (bool): If True, read the reconstructions from
      ./predictions/predictions_{name}.pkl instead of running the model.

    Returns:
    - tuple (X_predictions, accuracy): per-row 0/1 predictions as a numpy array,
      and their accuracy against y.
    """
    cache_path = f"./predictions/predictions_{name}.pkl"

    if load_pred:
        pred = load_predictions(cache_path)
    else:
        pred = tm.predict(X)

        # Only freshly computed reconstructions need saving; rewriting the file
        # that was just loaded is wasted I/O.
        save_predictions(pred, cache_path)

    # Per-row Hamming loss between input and reconstruction
    losses = np.array([hamming_loss(X[i], pred[i]) for i in range(len(X))])

    # A row is an anomaly (1) when its loss is strictly above the threshold
    X_predictions = (losses > threshold).astype(int)

    # Accuracy
    accuracy = calculate_accuracy(y, X_predictions)

    plot_mse(X, y, pred, name, threshold)
    plot_predictions(X, y, z, pred, X_predictions, name, plot_name, threshold)

    return X_predictions, accuracy

In [549]:
# Event (dataset) ids that the evaluation loop below runs through.
test_datasets = [47, 12, 4, 18, 28, 39, 66, 15, 78, 79, 30, 33, 11, 44, 49, 31, 67, 9, 91, 5, 90, 35, 16,
                 76, 8, 85, 6, 62, 36, 56, 94, 54, 43, 50, 64, 46, 65, 61, 93, 75, 41, 58, 48, 88, 57, 32, 89, 59, 63,
                 80, 37, 29, 1, 20, 60]

# Disabled alternative selection of ids, kept for reference (note that it also contains 70):
# 18, 28, 39, 66, 15, 78, 79, 30, 33, 11, 44, 49, 31, 67, 9, 91, 5, 90, 70, 35, 16, 76, 8, 85, 6, 62, 36, 56, 94, 54, 43, 50, 64, 46, 65, 61, 93, 75, 41, 58, 48, 88, 57, 32, 89, 59, 63, 80, 37, 29,
# Pickled model; run_prediction reads this module-level variable.
tm_autoencoder = load_model("models/latest_2.pkl")

# Disabled: derive the threshold from the model's reconstruction losses on a
# calibration set instead of using the fixed value below.
#X_thresh = load_thresh_dataset("C", 12)
#X_thresh_pred = tm_autoencoder.predict(X_thresh)
#threshold = calculate_threshold(X_thresh, X_thresh_pred)

# Fixed anomaly threshold on the per-row Hamming loss; run_prediction reads this
# module-level variable and get_predictions flags rows whose loss exceeds it.
threshold = 0.531

print(f"Threshold: {threshold}")


def run_prediction(farm, dataset, load_pred=False):
    """
    Loads one test event and runs anomaly prediction on it.

    Uses the module-level tm_autoencoder and threshold.

    Parameters:
    - farm: Wind farm identifier (e.g. "C").
    - dataset: Event/dataset identifier.
    - load_pred (bool): Forwarded to get_predictions; True reuses cached reconstructions.

    Returns:
    - tuple (X, labels, status_ids, train_test, is_normal, predictions, accuracy).
    """
    run_name = f"{farm}_{dataset}"
    title = f"Anomaly prediction using Tsetlin Autoencoder\nWind farm {farm} - Dataset {dataset}"

    # Inputs: features, per-row annotations, and the event-level label
    features = load_test_dataset(farm, dataset)
    row_labels, row_status_ids, split_flags = load_test_labels(farm, dataset)
    event_is_normal = load_test_label(farm, dataset)

    row_predictions, acc = get_predictions(
        features, row_labels, row_status_ids, run_name, title,
        tm_autoencoder, threshold, load_pred,
    )

    return features, row_labels, row_status_ids, split_flags, event_is_normal, row_predictions, acc


Threshold: 0.531


In [550]:
# For every test dataset, build a dataframe with status_type_id;label;prediction;train_test
# and collect it together with the event-level label and the row accuracy.
elements = []

# True: reuse the reconstructions cached in ./predictions; False: run the model again
lp = True

# `dataset_id` rather than `set`, so the builtin `set` is not shadowed for the rest
# of the notebook.
for dataset_id in test_datasets:
    (_, labels, status_ids, train_test,
     is_normal, predictions, accuracy) = run_prediction("C", dataset_id, load_pred=lp)

    result_df = pd.DataFrame({
        'status_type_id': status_ids,
        'label': labels,
        'prediction': predictions,
        'train_test': train_test,
    })

    print(f"Done with {dataset_id}. Accuracy: {accuracy}")

    elements.append({'dataset': dataset_id, 'normal': is_normal, 'data': result_df, 'accuracy': accuracy})

2025-05-03 18:01:01,462 - matplotlib.backends.backend_pdf - DEBUG - Assigning font /F1 = '/System/Library/Fonts/Supplemental/Arial.ttf'
2025-05-03 18:01:01,549 - matplotlib.backends.backend_pdf - DEBUG - Embedding font /System/Library/Fonts/Supplemental/Arial.ttf.
2025-05-03 18:01:01,550 - matplotlib.backends.backend_pdf - DEBUG - Writing TrueType font.
Done with 47. Accuracy: 0.5478757133798351
2025-05-03 18:01:02,422 - matplotlib.backends.backend_pdf - DEBUG - Assigning font /F1 = '/System/Library/Fonts/Supplemental/Arial.ttf'
2025-05-03 18:01:02,495 - matplotlib.backends.backend_pdf - DEBUG - Embedding font /System/Library/Fonts/Supplemental/Arial.ttf.
2025-05-03 18:01:02,496 - matplotlib.backends.backend_pdf - DEBUG - Writing TrueType font.
Done with 12. Accuracy: 0.3433887792500705
2025-05-03 18:01:03,326 - matplotlib.backends.backend_pdf - DEBUG - Assigning font /F1 = '/System/Library/Fonts/Supplemental/Arial.ttf'
2025-05-03 18:01:03,382 - matplotlib.backends.backend_pdf - DEBUG 

In [551]:
# NOTE(review): calculate_care_score is already imported from src.lib.care in the
# imports cell; this second import only adds the `care` module alias and could be
# moved to the imports cell or dropped.
import src.lib.care as care

# Score the per-dataset results collected in `elements` by the evaluation loop.
score = care.calculate_care_score(elements)
print(score)

0.6897384409275394


In [552]:
# Score to beat: 0.688609158102485 (obtained with a threshold of 0.531)