In [29]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install pycuda

In [2]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tmu.models.classification.vanilla_classifier import TMClassifier

In [3]:
include_sensors = ["sensor_0_avg", "sensor_1_avg", "power_2_avg", "sensor_3_avg", "sensor_4_avg", "sensor_9_avg", "power_5_avg", "power_6_avg", "sensor_7_avg", "sensor_8_avg", "sensor_10_avg", "sensor_11_avg"]

def load_df_and_annotate_anomalies(farm, dataset_id):
    path = f"../../data/care_to_compare/Wind Farm {farm}/datasets/{dataset_id}.csv"
    df = pd.read_csv(path, delimiter=';')
    
    # If ['status_type_id'] is 0 or 2, then 0, else 1
    df['label'] = df['status_type_id'].apply(lambda x: 0 if x in [0, 2] else 1)
    
    # Drop all columns except the ones in include_sensors
    df = df[include_sensors + ['label', 'train_test']]
    
    return df

In [16]:
# Load all required data for training

# 68;anomaly;2015-07-29 13:20:00;52063;2015-08-12 13:10:00;54076;Transformer failure

# Load csv file
df = pd.concat([
    load_df_and_annotate_anomalies('C', 55), 
    load_df_and_annotate_anomalies('C', 81), 
    load_df_and_annotate_anomalies('C', 8),
    load_df_and_annotate_anomalies('C', 85)
])
# Sensors to use

train_data = df[df['train_test'] == 'train']
test_data = df[df['train_test'] == 'prediction']

X_train = train_data.drop(columns=['label', 'train_test'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label', 'train_test'])
y_test = test_data['label']

In [17]:
# Remove all columns where the value is not a number
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.dropna(axis=1)

X_test = X_test.apply(pd.to_numeric, errors='coerce')
X_test = X_test.dropna(axis=1)

In [18]:
# Make sure that there are a equal numbers of rows of 0s and 1s
n = min(y_train.value_counts())
X_train = pd.concat([X_train[y_train == 0].sample(n=n), X_train[y_train == 1].sample(n=n)])
y_train = pd.concat([y_train[y_train == 0].sample(n=n), y_train[y_train == 1].sample(n=n)])

In [20]:
# Print number of 0s and 1s in the label sets
train_0s = np.count_nonzero(y_train == 0)
train_1s = np.count_nonzero(y_train == 1)

print(f"Train data: {train_0s} 0s and {train_1s} 1s")

test_0s = np.count_nonzero(y_test == 0)
test_1s = np.count_nonzero(y_test == 1)

print(f"Test data: {test_0s} 0s and {test_1s} 1s")

# PRint column names
print(X_train.columns)

Train data: 20899 0s and 20899 1s
Test data: 8051 0s and 341 1s
Index(['sensor_0_avg', 'sensor_1_avg', 'power_2_avg', 'sensor_3_avg',
       'sensor_4_avg', 'sensor_9_avg', 'power_5_avg', 'power_6_avg',
       'sensor_7_avg', 'sensor_8_avg', 'sensor_10_avg', 'sensor_11_avg'],
      dtype='object')


In [21]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case wher all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)
    
    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()
    
    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

# Example usage
X_train_binarized = convert_to_10bit_integers(X_train).astype(np.uint32)
X_test_binarized = convert_to_10bit_integers(X_test).astype(np.uint32)

y_train_binarized = y_train.values.astype(np.uint32)
y_test_binarized = y_test.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_train_binarized), len(X_train_binarized[0]))

41798 120


In [22]:
print(X_train_binarized.shape)
print(y_train_binarized.shape)

print(X_test_binarized.shape)    
print(y_test_binarized.shape)

(41798, 120)
(41798,)
(8392, 120)
(8392,)


In [23]:
# Print each row into a file data.txt
def write_to_file(X, y, filename):
    with open(filename, "w") as f:
        for i in range(len(X)):
            f.write(" ".join([str(x) for x in X[i]]) + " " + str(y[i]) + "\n")

write_to_file(X_train_binarized, y_train_binarized, "data_train_exp_1.txt")
write_to_file(X_test_binarized, y_test_binarized, "data_test_exp_1.txt")

In [25]:
epochs = 30

tm = TMClassifier(
    number_of_clauses=200,
    T=200,
    s=10.0,
    max_included_literals=32,
    weighted_clauses=True,
    platform="CPU",
    batch_size=1000,
)

print(f"Running {TMClassifier} for {epochs} epochs")

for epoch in range(epochs):
    tm.fit(X_train_binarized, y_train_binarized)

    print("Finished fitting")
    
    pred = tm.predict(X_test_binarized)

    pred_0s = np.count_nonzero(pred == 0)
    pred_1s = np.count_nonzero(pred == 1)

    print(f"Predicted 0s: {pred_0s}, Predicted 1s: {pred_1s}")
  
    result = 100* (pred == y_test_binarized).mean()

    # Print every 20 epochs
    #if (epoch + 1) % 20 == 0:
    print(f"Epoch: {epoch + 1}, Accuracy: {result:.5f}")


Running <class 'tmu.models.classification.vanilla_classifier.TMClassifier'> for 30 epochs
Finished fitting
Predicted 0s: 4248, Predicted 1s: 4144
Epoch: 1, Accuracy: 54.15872
Finished fitting
Predicted 0s: 3715, Predicted 1s: 4677
Epoch: 2, Accuracy: 48.14109
Finished fitting
Predicted 0s: 2614, Predicted 1s: 5778
Epoch: 3, Accuracy: 34.87846
Finished fitting
Predicted 0s: 257, Predicted 1s: 8135
Epoch: 4, Accuracy: 7.05434
Finished fitting
Predicted 0s: 1179, Predicted 1s: 7213
Epoch: 5, Accuracy: 17.96949
Finished fitting
Predicted 0s: 2172, Predicted 1s: 6220
Epoch: 6, Accuracy: 29.80219
Finished fitting
Predicted 0s: 393, Predicted 1s: 7999
Epoch: 7, Accuracy: 8.67493
Finished fitting
Predicted 0s: 1586, Predicted 1s: 6806
Epoch: 8, Accuracy: 22.81935
Finished fitting
Predicted 0s: 1421, Predicted 1s: 6971
Epoch: 9, Accuracy: 20.87703
Finished fitting
Predicted 0s: 418, Predicted 1s: 7974
Epoch: 10, Accuracy: 8.94900
Finished fitting
Predicted 0s: 2697, Predicted 1s: 5695
Epoch: 11

In [26]:
# Save the model

import pickle

with open("model.pickle", "wb") as f:
    pickle.dump(tm, f)

### CARE score evaluation

In [30]:
# Filter out all non
eval_df = pd.concat([
    load_df_and_annotate_anomalies('C', 55), 
    load_df_and_annotate_anomalies('C', 81), 
    load_df_and_annotate_anomalies('C', 8),
    load_df_and_annotate_anomalies('C', 85)
])

eval_data = eval_df[eval_df['train_test'] == 'prediction']

X_eval_data = eval_data.drop(columns=['label', 'train_test'])
y_eval_data = eval_data['label']

X_eval = convert_to_10bit_integers(X_eval_data).astype(np.uint32)
y_eval = y_eval_data.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_eval), len(X_eval[0]))

# Print the size of the evaluation data
print(X_eval.shape)

8392 120
(8392, 120)


In [31]:
# g = the ground truth of all data points with a normal status-ID within the prediction time frame
g = y_eval

# p = the corresponding prediction of an AD-model.
p = tm.predict(X_eval)

# Calculate the accuracy
accuracy = 100 * (p == g).mean()

print(f"Accuracy: {accuracy:.5f}")

print(g)
print(p)

Accuracy: 28.46759
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]


In [32]:
# Coverage
# Detection of as many correct anomalies as possible

beta = 0.5

# the number of true positives based on g and p
tp = np.sum((p == 1) & (g == 1))

# the number of false negatives based on g and p
fn = np.sum((p == 0) & (g == 1))

# the number of false positives based on g and p
fp = np.sum((p == 1) & (g == 0))

tn = np.sum((p == 0) & (g == 0))

Fbeta = (1 + beta**2) * tp / (1 + beta**2 * tp + beta**2 * fn + fp)

print(f"TP = {tp}, FN = {fn}, FP = {fp}, TN = {tn}, tot = {len(g)}")
print(f"F{beta} = {Fbeta:.10f}")

TP = 338, FN = 3, FP = 6000, TN = 2051, tot = 8392
F0.5 = 0.0694187718


In [None]:
# Accuracy
# Recognition of normal behavior

In [None]:
# Reliability
# Few false alarm events

In [None]:
# Earliness
# Detection of anomalies before fault gets critical.