In [1]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from tmu.models.classification.vanilla_classifier import TMClassifier

2025-01-07 18:50:17,713 - tmu.clause_bank.clause_bank_cuda - ERROR - No module named 'pycuda'
Traceback (most recent call last):
  File "C:\Users\kjell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\tmu\clause_bank\clause_bank_cuda.py", line 41, in <module>
    from pycuda._driver import Device, Context
ModuleNotFoundError: No module named 'pycuda'


In [3]:
exclude_columns = ["time_stamp", "asset_id", "id", "status_type_id"], 

def load_df_and_annotate_anomalies(farm, event_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    df = pd.read_csv(path, delimiter=';')
    
    event_info = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv", delimiter=';')

    # Find the row where event_id = event_id
    metadata = event_info[event_info['event_id'] == event_id]

    event_label = metadata["event_label"].values[0]
    event_start_id = metadata["event_start_id"].values[0]
    event_end_id = metadata["event_end_id"].values[0]

    label_value = 1 if event_label == "anomaly" else 0

    # All rows where the column "id" is between event_start_id and event_end_id
    df['label'] = 0
    df.loc[(df['id'] >= event_start_id) & (df['id'] <= event_end_id), 'label'] = label_value

    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]
    
    return df

In [4]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case wher all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)
    
    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()
    
    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

In [5]:
# Save the model
def load_model(filename) -> TMClassifier:
    with open(filename, "rb") as f:
        model = pickle.load(f)

    return model

In [6]:
tm = load_model("best.pkl")

### CARE score evaluation

In [7]:
def save_eval_metrics(farm, dataset_id, num_anom, num_norm, n_pred_anom, n_pred_norm, acc):
    with open("eval_metrics.csv", "a") as f:
        f.write(f"{farm},{dataset_id},{num_anom},{num_norm},{n_pred_anom},{n_pred_norm},{acc}\n")

In [8]:
def evaluate(farm, dataset_id):
    eval_data = load_df_and_annotate_anomalies(farm, dataset_id)

    X_eval_data = eval_data.drop(columns=['label', 'train_test'])
    y_eval_data = eval_data['label']

    # Remove all columns where the value is not a number
    X_eval_data = X_eval_data.apply(pd.to_numeric, errors='coerce')
    X_eval_data = X_eval_data.dropna(axis=1)

    X_eval = convert_to_10bit_integers(X_eval_data).astype(np.uint32)
    y_eval = y_eval_data.values.astype(np.uint32)

    # Print the number of 0s and 1s in the evaluation data
    eval_0s = np.count_nonzero(y_eval == 0)
    eval_1s = np.count_nonzero(y_eval == 1)

    print(f"Evaluation data: {eval_0s} 0s (normals) and {eval_1s} 1s (anomalies)")

    # g = the ground truth of all data points with a normal status-ID within the prediction time frame
    g = y_eval

    # p = the corresponding prediction of an AD-model.
    p = tm.predict(X_eval)

    print(f"Normals: {np.count_nonzero(g == 1)}")
    print(f"Anomalies: {np.count_nonzero(g == 0)}")

    # Accuracy

    # the number of false positives based on g and p
    fp = np.sum((p == 1) & (g == 0))

    # the number of true negatives based on g and p
    tn = np.sum((p == 0) & (g == 0))

    acc = tn / (fp + tn)    

    print(f"Accuracy = {acc:.5f}, FP = {fp}, TN = {tn}")

    save_eval_metrics(farm, dataset_id, eval_0s, eval_1s, np.count_nonzero(p == 0), np.count_nonzero(p == 1), acc)

In [10]:
# Read all filenames in ../../../data/care_to_compare/Wind Farm C/datasets
filenames = os.listdir("../../../data/care_to_compare/Wind Farm C/datasets")

# Remove the .csv extension
filenames = [filename.split(".")[0] for filename in filenames]


for filename in filenames:
    dataset_id = filename

    print(f"Evaluating C - {dataset_id}")
    evaluate("C", int(dataset_id)) 

Evaluating C - 1
Evaluation data: 53569 0s (normals) and 0 1s (anomalies)
Normals: 0
Anomalies: 53569
Accuracy = 0.99991
Evaluating C - 11
Evaluation data: 53280 0s (normals) and 3157 1s (anomalies)
Normals: 3157
Anomalies: 53280
Accuracy = 0.99668
Evaluating C - 12
Evaluation data: 52848 0s (normals) and 3259 1s (anomalies)
Normals: 3259
Anomalies: 52848
Accuracy = 1.00000
Evaluating C - 15


KeyboardInterrupt: 