## Experiment description

Train a TM Classifier to classify anomalies in CARE to comapre SCADA dataset.
Train on a subset of the datasets, then evaluate the model on all datasets in Wind Farm C.

Use the accuracy score as a simple metric to measure the performace.

In [1]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install pycuda

In [2]:
# Import all required libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from tmu.models.classification.vanilla_classifier import TMClassifier

2025-02-08 18:23:22,841 - tmu.clause_bank.clause_bank_cuda - ERROR - No module named 'pycuda'
Traceback (most recent call last):
  File "/Users/kjellhaaland/Documents/GitHub/uia-master-thesis/.venv/lib/python3.12/site-packages/tmu/clause_bank/clause_bank_cuda.py", line 41, in <module>
    from pycuda._driver import Device, Context
ModuleNotFoundError: No module named 'pycuda'


In [3]:
exclude_columns = ["time_stamp", "asset_id", "id", "status_type_id"],


def load_df_and_annotate_anomalies(farm, event_id):
    path = f"../../../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    df = pd.read_csv(path, delimiter=';')

    event_info = pd.read_csv(f"../../../data/care_to_compare/Wind Farm {farm}/event_info.csv", delimiter=';')

    # Find the row where event_id = event_id
    metadata = event_info[event_info['event_id'] == event_id]

    event_label = metadata["event_label"].values[0]
    event_start_id = metadata["event_start_id"].values[0]
    event_end_id = metadata["event_end_id"].values[0]

    label_value = 1 if event_label == "anomaly" else 0

    # All rows where the column "id" is between event_start_id and event_end_id
    df['label'] = 0
    df.loc[(df['id'] >= event_start_id) & (df['id'] <= event_end_id), 'label'] = label_value

    # Include all columns except for the ones in exclude_columns
    df = df[[col for col in df.columns if col not in exclude_columns]]

    return df

In [4]:
# Load all required data for training

# Load csv file
df_train = pd.concat([
    load_df_and_annotate_anomalies('C', 55),
    load_df_and_annotate_anomalies('C', 81),
])

df_test = pd.concat([
    load_df_and_annotate_anomalies('C', 47),
    load_df_and_annotate_anomalies('C', 8),
])

# Sensors to use

train_data = df_train
test_data = df_test[df_test['train_test'] == 'prediction']

X_train = train_data.drop(columns=['label', 'train_test'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label', 'train_test'])
y_test = test_data['label']

# Print number of 0s and 1s in the label sets
train_0s = np.count_nonzero(y_train == 0)
train_1s = np.count_nonzero(y_train == 1)

print(f"Train data: {train_0s} 0s and {train_1s} 1s")

test_0s = np.count_nonzero(y_test == 0)
test_1s = np.count_nonzero(y_test == 1)

print(f"Test data: {test_0s} 0s and {test_1s} 1s")

df_train.head()

Train data: 106848 0s and 2837 1s
Test data: 3106 0s and 713 1s


Unnamed: 0,time_stamp,asset_id,id,train_test,status_type_id,sensor_0_avg,sensor_0_max,sensor_0_min,sensor_0_std,sensor_1_avg,...,wind_speed_236_std,wind_speed_235_avg,wind_speed_235_max,wind_speed_235_min,wind_speed_235_std,wind_speed_237_avg,wind_speed_237_max,wind_speed_237_min,wind_speed_237_std,label
0,2017-10-27 11:30:00,50,0,train,0,88.531,92.3,83.9,1.483,-0.0023,...,1.177,13.067,16.5,7.0,1.242,13.404,17.4,7.0,1.458,0
1,2017-10-27 11:40:00,50,1,train,0,88.453,94.3,83.9,1.651,0.0037,...,1.268,13.375,16.6,8.1,1.276,13.778,18.4,7.7,1.614,0
2,2017-10-27 11:50:00,50,2,train,0,88.376,92.0,84.8,1.497,-0.0006,...,1.415,12.775,16.5,6.8,1.493,13.125,17.0,5.6,1.654,0
3,2017-10-27 12:00:00,50,3,train,0,88.385,92.3,84.8,1.497,-0.0027,...,1.146,13.017,16.3,7.6,1.191,13.35,16.9,6.4,1.448,0
4,2017-10-27 12:10:00,50,4,train,0,88.336,91.7,83.9,1.528,0.001,...,1.397,11.786,16.3,4.7,1.464,12.201,16.4,4.7,1.669,0


In [5]:
# Remove all columns where the value is not a number
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.dropna(axis=1)

X_test = X_test.apply(pd.to_numeric, errors='coerce')
X_test = X_test.dropna(axis=1)

In [6]:
def convert_to_10bit_integers(df):
    normalized_df = df.copy()

    for col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()

        # Edge case where all values are 0
        if min_val == max_val:
            normalized_df[col] = 0
        else:
            normalized_df[col] = ((df[col] - min_val) / (max_val - min_val) * 1023)

    # Convert the normalized values to integers
    int_df = normalized_df.astype(int)

    # Flatten each row into an array of 10-bit integers
    int_arrays = int_df.apply(lambda row: row.values.flatten(), axis=1).tolist()

    # Represent each cell as a 10-bit integer string
    bin_arrays = [[f"{cell:010b}" for cell in row] for row in int_arrays]

    # Split each 10-bit integer string into individual integers for each row
    # preserve the columns of bin_arrays
    bin_int_arrays = [[int(cell) for cell in list(''.join(row))] for row in bin_arrays]

    # Convert to numpy array
    int_arrays = np.array(bin_int_arrays)

    return int_arrays


# Example usage
X_train_binarized = convert_to_10bit_integers(X_train).astype(np.uint32)
X_test_binarized = convert_to_10bit_integers(X_test).astype(np.uint32)

y_train_binarized = y_train.values.astype(np.uint32)
y_test_binarized = y_test.values.astype(np.uint32)

# Print dimensions of the integer arrays
print(len(X_train_binarized), len(X_train_binarized[0]))

109685 9550


In [7]:
print(X_train_binarized.shape)
print(y_train_binarized.shape)
print(X_test_binarized.shape)
print(y_test_binarized.shape)

(109685, 9550)
(109685,)
(3819, 9550)
(3819,)


In [8]:
# Save the model
def save_model(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)


def save_accuracy(epoch, accuracy, filename):
    with open(filename, "a") as f:
        f.write(f"{epoch},{accuracy}\n")

In [10]:
epochs = 10

tm = TMClassifier(
    number_of_clauses=1000,
    T=1000,
    s=10.0,
    max_included_literals=32,
    weighted_clauses=True,
    platform="CPU",
    batch_size=1000,
)

best_accuracy = 0

print(f"Running {TMClassifier} for {epochs} epochs")

for epoch in range(epochs):
    tm.fit(X_train_binarized, y_train_binarized)

    print("Finished fitting")

    pred = tm.predict(X_test_binarized)

    pred_0s = np.count_nonzero(pred == 0)
    pred_1s = np.count_nonzero(pred == 1)

    print(f"Predicted 0s: {pred_0s}, Predicted 1s: {pred_1s}")

    # the number of false positives based on g and p
    fp = np.sum((pred == 1) & (y_test_binarized == 0))

    # the number of true negatives based on g and p
    tn = np.sum((pred == 0) & (y_test_binarized == 0))

    acc = tn / (fp + tn)

    print(f"Accuracy = {acc:.5f}")

    save_accuracy(epoch, acc, "accuracy.txt")

    # Print every 20 epochs
    #if (epoch + 1) % 20 == 0:
    print(f"Epoch: {epoch + 1}, Accuracy: {acc:.5f}")

    if acc > best_accuracy:
        best_accuracy = acc

        print("Saving model")
        save_model(tm, "best.pkl")


Running <class 'tmu.models.classification.vanilla_classifier.TMClassifier'> for 10 epochs


OverflowError: Python integer -1 out of bounds for uint32