## Experiment description

Train a TM Classifier to classify anomalies in CARE to comapre SCADA dataset.
Train on a subset of the datasets, then evaluate the model on all datasets in Wind Farm C.

Use the accuracy score as a simple metric to measure the performace.

In [1]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install pycuda

In [2]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


from tmu.models.classification.vanilla_classifier import TMClassifier

2025-01-06 17:40:43,694 - tmu.clause_bank.clause_bank_cuda - ERROR - No module named 'pycuda'
Traceback (most recent call last):
  File "C:\Users\kjell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\tmu\clause_bank\clause_bank_cuda.py", line 41, in <module>
    from pycuda._driver import Device, Context
ModuleNotFoundError: No module named 'pycuda'


In [3]:
exclude_columns = ["time_stamp", "asset_id", "id", "status_type_id"], 

def load_df_and_annotate_anomalies(farm, dataset_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{dataset_id}.csv"
    df = pd.read_csv(path, delimiter=';')
    
    # If ['status_type_id'] is 0 or 2, then 0, else 1
    df['label'] = df['status_type_id'].apply(lambda x: 0 if x in [0, 2] else 1)
    
    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]
    
    return df

In [5]:
# Load all required data for training

# 68;anomaly;2015-07-29 13:20:00;52063;2015-08-12 13:10:00;54076;Transformer failure

# Load csv file
df = pd.concat([
    #load_df_and_annotate_anomalies('C', 55), 
    load_df_and_annotate_anomalies('C', 81), 
    #load_df_and_annotate_anomalies('C', 8),
    #load_df_and_annotate_anomalies('C', 85)
])
# Sensors to use

train_data = df[df['train_test'] == 'train']
test_data = df[df['train_test'] == 'prediction']

X_train = train_data.drop(columns=['label', 'train_test'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label', 'train_test'])
y_test = test_data['label']

# Print number of 0s and 1s in the label sets
train_0s = np.count_nonzero(y_train == 0)
train_1s = np.count_nonzero(y_train == 1)

print(f"Train data: {train_0s} 0s and {train_1s} 1s")

test_0s = np.count_nonzero(y_test == 0)
test_1s = np.count_nonzero(y_test == 1)

print(f"Test data: {test_0s} 0s and {test_1s} 1s")

Train data: 44510 0s and 8050 1s
Test data: 1188 0s and 184 1s


In [6]:
# Remove all columns where the value is not a number
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.dropna(axis=1)

X_test = X_test.apply(pd.to_numeric, errors='coerce')
X_test = X_test.dropna(axis=1)

In [7]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case wher all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)
    
    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()
    
    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays

# Example usage
X_train_binarized = convert_to_10bit_integers(X_train).astype(np.uint32)
X_test_binarized = convert_to_10bit_integers(X_test).astype(np.uint32)

y_train_binarized = y_train.values.astype(np.uint32)
y_test_binarized = y_test.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_train_binarized), len(X_train_binarized[0]))

52560 9550


In [8]:
print(X_train_binarized.shape)
print(y_train_binarized.shape)
print(X_test_binarized.shape)    
print(y_test_binarized.shape)

(52560, 9550)
(52560,)
(1372, 9550)
(1372,)


In [12]:
# Save the model
def save_model(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

def save_accuracy(epoch, accuracy, filename):
    with open(filename, "a") as f:
        f.write(f"{epoch},{accuracy}\n")

In [13]:
epochs = 10

tm = TMClassifier(
    number_of_clauses=1000,
    T=1000,
    s=10.0,
    max_included_literals=32,
    weighted_clauses=True,
    platform="CPU",
    batch_size=1000,
)

best_accuracy = 0

print(f"Running {TMClassifier} for {epochs} epochs")

for epoch in range(epochs):
    tm.fit(X_train_binarized, y_train_binarized)

    print("Finished fitting")
    
    pred = tm.predict(X_test_binarized)

    pred_0s = np.count_nonzero(pred == 0)
    pred_1s = np.count_nonzero(pred == 1)

    print(f"Predicted 0s: {pred_0s}, Predicted 1s: {pred_1s}")
  
    result = 100* (pred == y_test_binarized).mean()

    save_accuracy(epoch, result, "accuracy.txt")

    # Print every 20 epochs
    #if (epoch + 1) % 20 == 0:
    print(f"Epoch: {epoch + 1}, Accuracy: {result:.5f}")

    if result > best_accuracy:
        best_accuracy = result

        print("Saving model")
        save_model(tm, "best.pkl")


Running <class 'tmu.models.classification.vanilla_classifier.TMClassifier'> for 10 epochs
Finished fitting
Predicted 0s: 1087, Predicted 1s: 285
Epoch: 1, Accuracy: 92.63848
Saving model
Finished fitting
Predicted 0s: 1129, Predicted 1s: 243
Epoch: 2, Accuracy: 95.69971
Saving model
Finished fitting
Predicted 0s: 1164, Predicted 1s: 208
Epoch: 3, Accuracy: 98.25073
Saving model
Finished fitting
Predicted 0s: 1163, Predicted 1s: 209
Epoch: 4, Accuracy: 98.17784
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 5, Accuracy: 98.54227
Saving model
Finished fitting
Predicted 0s: 1167, Predicted 1s: 205
Epoch: 6, Accuracy: 98.46939
Finished fitting
Predicted 0s: 1169, Predicted 1s: 203
Epoch: 7, Accuracy: 98.61516
Saving model
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 8, Accuracy: 98.54227
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 9, Accuracy: 98.54227
Finished fitting
Predicted 0s: 1168, Predicted 1s: 204
Epoch: 10, Accuracy: 98.54227
