In [24]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from tmu.models.classification.vanilla_classifier import TMClassifier

In [25]:
include_sensors = ["sensor_0_avg", "sensor_1_avg", "power_2_avg", "sensor_3_avg", "sensor_4_avg", "sensor_9_avg", "power_5_avg", "power_6_avg", "sensor_7_avg", "sensor_8_avg", "sensor_10_avg", "sensor_11_avg"]

exclude_columns = ["time_stamp", "asset_id", "id", "status_type_id"], 

def load_df_and_annotate_anomalies(farm, dataset_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{dataset_id}.csv"
    df = pd.read_csv(path, delimiter=';')
    
    # If ['status_type_id'] is 0 or 2 (considered normal), then 0, else 1
    df['label'] = df['status_type_id'].apply(lambda x: 0 if x in [0, 2] else 1)
    
    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]
    
    return df

In [26]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case wher all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)
    
    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()
    
    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

In [27]:
# Save the model
def load_model(filename) -> TMClassifier:
    with open(filename, "rb") as f:
        model = pickle.load(f)

    return model

In [28]:
tm = load_model("best.pkl")

### CARE score evaluation

In [29]:
# Filter out all non
eval_df = pd.concat([
    load_df_and_annotate_anomalies('C', 6), 
    #load_df_and_annotate_anomalies('C', 81), 
    #load_df_and_annotate_anomalies('C', 8),
    #load_df_and_annotate_anomalies('C', 85)
])

#eval_data = eval_df[eval_df['train_test'] == 'prediction']
eval_data = eval_df

X_eval_data = eval_data.drop(columns=['label', 'train_test'])
y_eval_data = eval_data['label']

# Remove all columns where the value is not a number
X_eval_data = X_eval_data.apply(pd.to_numeric, errors='coerce')
X_eval_data = X_eval_data.dropna(axis=1)

X_eval = convert_to_10bit_integers(X_eval_data).astype(np.uint32)
y_eval = y_eval_data.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_eval), len(X_eval[0]))

# Print the size of the evaluation data
print(X_eval.shape)

# Print the number of 0s and 1s in the evaluation data
eval_0s = np.count_nonzero(y_eval == 0)
eval_1s = np.count_nonzero(y_eval == 1)

print(f"Evaluation data: {eval_0s} 0s (normals) and {eval_1s} 1s (anomalies)")

54865 9550
(54865, 9550)
Evaluation data: 46335 0s (normals) and 8530 1s (anomalies)


In [30]:
# g = the ground truth of all data points with a normal status-ID within the prediction time frame
g = y_eval

# p = the corresponding prediction of an AD-model.
p = tm.predict(X_eval)

In [31]:
# Calculate the accuracy

correct = (p == g).sum()
faulty = (p != g).sum()

print(f"Correct: {correct}")
print(f"Faulty: {faulty}")

acc = correct / (correct + faulty) * 100

print(f"Accuracy: {acc:.5f}")

accuracy = 100 * (p == g).mean()

print(f"Accuracy: {accuracy:.5f}")

print(f"Normals: {np.count_nonzero(g == 1)}")
print(f"Anomalies: {np.count_nonzero(g == 0)}")

Correct: 54865
Faulty: 0
Accuracy: 100.00000
Accuracy: 100.00000
Normals: 8530
Anomalies: 46335


In [32]:
# Coverage
# Detection of as many correct anomalies as possible

beta = 0.5

# the number of true positives based on g and p
tp = np.sum((p == 1) & (g == 1))

# the number of false negatives based on g and p
fn = np.sum((p == 0) & (g == 1))

# the number of false positives based on g and p
fp = np.sum((p == 1) & (g == 0))

# the number of true negatives based on g and p
tn = np.sum((p == 0) & (g == 0))

Fbeta = (1 + beta**2) * tp / (1 + beta**2 * tp + beta**2 * fn + fp)

print(f"TP = {tp}, FN = {fn}, FP = {fp}, TN = {tn}, tot = {len(g)}")
print(f"Coverage (F) = {Fbeta:.5f}")

TP = 8530, FN = 0, FP = 0, TN = 46335, tot = 54865
Coverage (F) = 4.99766


In [33]:
# Accuracy
# Recognition of normal behavior

# the number of false positives based on g and p
fp = np.sum((p == 1) & (g == 0))

# the number of true negatives based on g and p
tn = np.sum((p == 0) & (g == 0))

acc = tn / (fp + tn)

print(f"Accuracy = {acc:.5f}")

Accuracy = 1.00000


In [34]:
def calc(N, st, pt):
    print(f"N={N}, len(st)={len(st)}, len(pt)={len(pt)}")
    # Initialize crit as a list of zeros with size N + 1
    crit = [0] * (N + 1)

    # Iterate through the range 1 to N (inclusive)
    for i in range(1, N):
        if st[i] == 0:  # Assuming `st` is a list or array
            if pt[i] == 1:  # Assuming `pt` is a list or array
                crit[i] = crit[i - 1] + 1
            else:
                crit[i] = max(crit[i - 1] - 1, 0)
        else:
            crit[i] = crit[i - 1]

    # Trim crit to include only elements 1 through N
    crit = crit[1:N + 1]
    return crit

def print_results(set, p, g, crit):
    g = ["N" if x == 0 else "A" for x in g]
    p = ["N" if x == 0 else "A" for x in p]
    set = ["A" if x == 0 else "N" for x in set]

    df = pd.DataFrame({'set': set, 'p': p, 'g': g, 'crit': crit})
    df.to_csv("results.csv")

In [35]:
# Reliability
# Few false alarm events

# g[i] = 0 if the status type is 0 or 2, else 1
set = [1 if x == 0 else 0 for x in g]

# Initialize a list of nulls
crit = calc(len(set), set, p)

# Print the set, p and crid in a csv file called "results.csv"
print_results(set, p, g, crit)

crit_max = np.max(crit)

print(f"Max crit: {crit_max}")

tc = 75

# If a value is larger than the threshold, then it is an anomaly (1) else it is not (0)
crit = [1 if c > tc else 0 for c in crit]
crit = np.array(crit)

# the number of true positives based on g and p
tp = np.sum((crit == 1) & (g == 1))

# the number of false negatives based on g and p
fn = np.sum((crit == 0) & (g == 1))

# the number of false positives based on g and p
fp = np.sum((crit == 1) & (g == 0))

# the number of true negatives based on g and p
tn = np.sum((crit == 0) & (g == 0))

beta = 0.5

EFbeta = (1 + beta**2) * tp / (1 + beta**2 * tp + beta**2 * fn + fp)

print(f"TP = {tp}, FN = {fn}, FP = {fp}, TN = {tn}, tot = {len(crit)}")
print(f"Reliability (EFbeta) = {EFbeta:.15f}")

N=54865, len(st)=54865, len(pt)=54865
Max crit: 8530
TP = 8456, FN = 74, FP = 45750, TN = 585, tot = 54865
Reliability (EFbeta) = 0.220744097653680


In [36]:
# Earliness
# Detection of anomalies before fault gets critical.

def calc_weight(sequence_of_anomalies):
    # Normalize the positions of the elements in the sequence between 0 and 1
    normalized_positions = np.linspace(0, 1, len(sequence_of_anomalies))

    # If the normalized position is less than 0.5, then the weight is 1, else it linearly decreases to 0
    weights = [1 if pos < 0.5 else 1 - pos for pos in normalized_positions]

    return np.array(weights).astype(np.float32)

anomalies = []

buffer = []

# Get all subsequences of anomalies of g
for i in range(len(g)):

    # If no anomaly, continue
    if(g[i] == 0 and len(buffer) == 0):
        continue

    if(g[i] == 0 and len(buffer) > 0):
        anomalies.append(buffer)
        buffer = []
        continue

    buffer.append((i, g[i], p[i]))

if len(buffer) > 0:
    anomalies.append(buffer)

In [37]:
wspas = []

for anomaly in anomalies:
    gs = np.array([x[1] for x in anomaly]).astype(np.float32)
    ps = np.array([x[2] for x in anomaly]).astype(np.float32)

    weights = calc_weight(gs)

    gsum = sum(weights * ps) / sum(weights)
    wspas.append(gsum) 

WS = np.mean(wspas)

print(f"Earliness (WS) = {WS:.15f}")

Earliness (WS) = 1.000000000000000


In [38]:
# CARE score calculation

# Arithmetic mean of the Fbeta
F_final = 

SyntaxError: invalid syntax (1915846071.py, line 4)

: 