In [29]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install pycuda

In [8]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


from tmu.models.classification.vanilla_classifier import TMClassifier

In [9]:
include_sensors = ["sensor_0_avg", "sensor_1_avg", "power_2_avg", "sensor_3_avg", "sensor_4_avg", "sensor_9_avg", "power_5_avg", "power_6_avg", "sensor_7_avg", "sensor_8_avg", "sensor_10_avg", "sensor_11_avg"]

exclude_columns = ["time_stamp", "asset_id", "id", "status_type_id"], 

def load_df_and_annotate_anomalies(farm, dataset_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{dataset_id}.csv"
    df = pd.read_csv(path, delimiter=';')
    
    # If ['status_type_id'] is 0 or 2, then 0, else 1
    df['label'] = df['status_type_id'].apply(lambda x: 0 if x in [0, 2] else 1)
    
    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]
    
    return df

In [10]:
# Load all required data for training

# 68;anomaly;2015-07-29 13:20:00;52063;2015-08-12 13:10:00;54076;Transformer failure

# Load csv file
df = pd.concat([
    #load_df_and_annotate_anomalies('C', 55), 
    load_df_and_annotate_anomalies('C', 81), 
    #load_df_and_annotate_anomalies('C', 8),
    #load_df_and_annotate_anomalies('C', 85)
])
# Sensors to use

train_data = df[df['train_test'] == 'train']
test_data = df[df['train_test'] == 'prediction']

X_train = train_data.drop(columns=['label', 'train_test'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label', 'train_test'])
y_test = test_data['label']

# Print number of 0s and 1s in the label sets
train_0s = np.count_nonzero(y_train == 0)
train_1s = np.count_nonzero(y_train == 1)

print(f"Train data: {train_0s} 0s and {train_1s} 1s")

test_0s = np.count_nonzero(y_test == 0)
test_1s = np.count_nonzero(y_test == 1)

print(f"Test data: {test_0s} 0s and {test_1s} 1s")

# PRint column names
print(X_train.columns)

Train data: 44510 0s and 8050 1s
Test data: 1188 0s and 184 1s
Index(['time_stamp', 'asset_id', 'id', 'status_type_id', 'sensor_0_avg',
       'sensor_0_max', 'sensor_0_min', 'sensor_0_std', 'sensor_1_avg',
       'sensor_1_max',
       ...
       'wind_speed_236_min', 'wind_speed_236_std', 'wind_speed_235_avg',
       'wind_speed_235_max', 'wind_speed_235_min', 'wind_speed_235_std',
       'wind_speed_237_avg', 'wind_speed_237_max', 'wind_speed_237_min',
       'wind_speed_237_std'],
      dtype='object', length=956)


In [11]:
# Remove all columns where the value is not a number
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.dropna(axis=1)

X_test = X_test.apply(pd.to_numeric, errors='coerce')
X_test = X_test.dropna(axis=1)

In [13]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case wher all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)
    
    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()
    
    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

# Example usage
X_train_binarized = convert_to_10bit_integers(X_train).astype(np.uint32)
X_test_binarized = convert_to_10bit_integers(X_test).astype(np.uint32)

y_train_binarized = y_train.values.astype(np.uint32)
y_test_binarized = y_test.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_train_binarized), len(X_train_binarized[0]))

52560 9550


In [14]:
print(X_train_binarized.shape)
print(y_train_binarized.shape)

print(X_test_binarized.shape)    
print(y_test_binarized.shape)

(52560, 9550)
(52560,)
(1372, 9550)
(1372,)


In [15]:
# Print each row into a file data.txt
def write_to_file(X, y, filename):
    with open(filename, "w") as f:
        for i in range(len(X)):
            f.write(" ".join([str(x) for x in X[i]]) + " " + str(y[i]) + "\n")

write_to_file(X_train_binarized, y_train_binarized, "data_train_exp_1.txt")
write_to_file(X_test_binarized, y_test_binarized, "data_test_exp_1.txt")

KeyboardInterrupt: 

In [16]:
# Save the model

import pickle

def save_model(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

In [18]:
epochs = 10

tm = TMClassifier(
    number_of_clauses=1000,
    T=1000,
    s=10.0,
    max_included_literals=32,
    weighted_clauses=True,
    platform="CPU",
    batch_size=1000,
)

best_accuracy = 0

print(f"Running {TMClassifier} for {epochs} epochs")

for epoch in range(epochs):
    tm.fit(X_train_binarized, y_train_binarized)

    print("Finished fitting")
    
    pred = tm.predict(X_test_binarized)

    pred_0s = np.count_nonzero(pred == 0)
    pred_1s = np.count_nonzero(pred == 1)

    print(f"Predicted 0s: {pred_0s}, Predicted 1s: {pred_1s}")
  
    result = 100* (pred == y_test_binarized).mean()

    # Print every 20 epochs
    #if (epoch + 1) % 20 == 0:
    print(f"Epoch: {epoch + 1}, Accuracy: {result:.5f}")

    if result > best_accuracy:
        best_accuracy = result

        print("Saving model")
        save_model(tm, "best.pkl")


Running <class 'tmu.models.classification.vanilla_classifier.TMClassifier'> for 10 epochs
Finished fitting
Predicted 0s: 1142, Predicted 1s: 230
Epoch: 1, Accuracy: 96.64723
Saving model
Finished fitting
Predicted 0s: 1158, Predicted 1s: 214
Epoch: 2, Accuracy: 97.81341
Saving model
Finished fitting
Predicted 0s: 1155, Predicted 1s: 217
Epoch: 3, Accuracy: 97.59475
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 4, Accuracy: 98.54227
Saving model
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 5, Accuracy: 98.54227
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 6, Accuracy: 98.54227
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 7, Accuracy: 98.54227
Finished fitting
Predicted 0s: 1169, Predicted 1s: 203
Epoch: 8, Accuracy: 98.61516
Saving model
Finished fitting
Predicted 0s: 1169, Predicted 1s: 203
Epoch: 9, Accuracy: 98.61516
Finished fitting
Predicted 0s: 1169, Predicted 1s: 203
Epoch: 10, Accuracy: 98.61516


### CARE score evaluation

In [28]:
# Filter out all non
eval_df = pd.concat([
    load_df_and_annotate_anomalies('C', 55), 
    load_df_and_annotate_anomalies('C', 81), 
    load_df_and_annotate_anomalies('C', 8),
    load_df_and_annotate_anomalies('C', 85)
])

eval_data = eval_df[eval_df['train_test'] == 'prediction']

X_eval_data = eval_data.drop(columns=['label', 'train_test'])
y_eval_data = eval_data['label']

# Remove all columns where the value is not a number
X_eval_data = X_eval_data.apply(pd.to_numeric, errors='coerce')
X_eval_data = X_eval_data.dropna(axis=1)

X_eval = convert_to_10bit_integers(X_eval_data).astype(np.uint32)
y_eval = y_eval_data.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_eval), len(X_eval[0]))

# Print the size of the evaluation data
print(X_eval.shape)

# Print the number of 0s and 1s in the evaluation data
eval_0s = np.count_nonzero(y_eval == 0)
eval_1s = np.count_nonzero(y_eval == 1)

print(f"Evaluation data: {eval_0s} 0s and {eval_1s} 1s")

8392 9550
(8392, 9550)
Evaluation data: 8051 0s and 341 1s


In [29]:
# g = the ground truth of all data points with a normal status-ID within the prediction time frame
g = y_eval

# p = the corresponding prediction of an AD-model.
p = tm.predict(X_eval)

# Calculate the accuracy
accuracy = 100 * (p == g).mean()

print(f"Accuracy: {accuracy:.5f}")

print(f"Positives: {np.count_nonzero(g == 1)}")
print(f"Negatives: {np.count_nonzero(g == 0)}")

Accuracy: 99.72593
Positives: 341
Negatives: 8051


In [31]:
# Coverage
# Detection of as many correct anomalies as possible

beta = 0.5

# the number of true positives based on g and p
tp = np.sum((p == 1) & (g == 1))

# the number of false negatives based on g and p
fn = np.sum((p == 0) & (g == 1))

# the number of false positives based on g and p
fp = np.sum((p == 1) & (g == 0))

tn = np.sum((p == 0) & (g == 0))

Fbeta = (1 + beta**2) * tp / (1 + beta**2 * tp + beta**2 * fn + fp)

print(f"TP = {tp}, FN = {fn}, FP = {fp}, TN = {tn}, tot = {len(g)}")
print(f"F{beta} = {Fbeta:.5f}")

TP = 341, FN = 0, FP = 23, TN = 8028, tot = 8392
F0.5 = 3.90160


In [None]:
# Accuracy
# Recognition of normal behavior

In [None]:
# Reliability
# Few false alarm events

In [None]:
# Earliness
# Detection of anomalies before fault gets critical.