In [45]:
%pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [46]:
import numpy as np
import optuna
import pickle

from tmu.models.autoencoder.autoencoder import TMAutoEncoder

In [47]:
# Compute backend name. NOTE(review): presumably meant for TMAutoEncoder's
# `platform` argument -- confirm it is actually wired through where the model is built.
platform = "CPU"
# Wind-farm identifier; forms the `{farm}` part of the dataset file names.
windfarm = "B"

In [48]:
def load_train_dataset(farm, event_id):
    """Load one training feature file from ``./data_train``.

    Args:
        farm: Wind-farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event number used in the file name.

    Returns:
        ``np.ndarray`` of dtype ``uint32`` parsed from
        ``./data_train/X_{farm}_{event_id}.txt``.
    """
    # loadtxt already returns a freshly allocated uint32 ndarray, so the
    # former np.array(...).astype(...) round trip only added two full copies.
    return np.loadtxt(f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32)


def load_thresh_dataset(farm, event_id, max_rows=5000):
    """Load the leading rows of a training file for threshold calibration.

    Args:
        farm: Wind-farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event number used in the file name.
        max_rows: Maximum number of data rows to read from the top of the
            file. Defaults to 5000; ``None`` reads the whole file.

    Returns:
        ``np.ndarray`` of dtype ``uint32`` holding at most ``max_rows`` rows of
        ``./data_train/X_{farm}_{event_id}.txt``.
    """
    # Let loadtxt stop after `max_rows` rows instead of parsing the entire
    # text file and slicing the result afterwards.
    return np.loadtxt(
        f"./data_train/X_{farm}_{event_id}.txt",
        dtype=np.uint32,
        max_rows=max_rows,
    )


def load_test_dataset(farm, event_id):
    """Load one test feature file from ``./data_test``.

    Args:
        farm: Wind-farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event number used in the file name.

    Returns:
        ``np.ndarray`` of dtype ``uint32`` parsed from
        ``./data_test/X_{farm}_{event_id}.txt``.
    """
    # loadtxt already yields a uint32 ndarray; no extra copies are needed.
    return np.loadtxt(f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32)


def load_test_labels(farm, event_id):
    """Load the labels that accompany one test feature file.

    Args:
        farm: Wind-farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event number used in the file name.

    Returns:
        ``np.ndarray`` of dtype ``uint32`` parsed from
        ``./data_test/y_{farm}_{event_id}.txt``.
    """
    # loadtxt already yields a uint32 ndarray; no extra copies are needed.
    return np.loadtxt(f"./data_test/y_{farm}_{event_id}.txt", dtype=np.uint32)


def load_model(filename) -> TMAutoEncoder:
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object the pickle contains; annotated as ``TMAutoEncoder``.
    """
    # SECURITY: unpickling can execute arbitrary code, so only open files
    # that come from a trusted source.
    with open(filename, "rb") as handle:
        return pickle.load(handle)

In [49]:
def calculate_reconstruction_accuracy(X, pred):
    """Return the number of matching elements divided by ``len(X)``.

    For a 1-D sample this is the fraction of positions where the
    reconstruction ``pred`` equals the input ``X``.

    Args:
        X: Original sample.
        pred: Reconstructed sample, compared element-wise against ``X``.

    Returns:
        Count of equal elements divided by ``len(X)``.
    """
    matches = X == pred
    return np.sum(matches) / len(X)


def calculate_accuracy(X, pred, labels, threshold=0.5):
    """Score threshold-based anomaly predictions against ground-truth labels.

    A row is predicted as ``1`` when its reconstruction accuracy (fraction of
    features where ``pred`` equals ``X``) is strictly below ``threshold``,
    and as ``0`` otherwise.

    Args:
        X: 2-D array of input samples, one sample per row.
        pred: 2-D array of reconstructions with the same shape as ``X``.
        labels: 1-D array with one 0/1 label per row of ``X``.
        threshold: Reconstruction-accuracy cut-off below which a row is
            predicted as ``1``.

    Returns:
        Fraction of rows whose prediction equals its label.
    """
    # Vectorised per-row reconstruction accuracy; replaces a Python-level
    # loop over every row with identical results.
    row_accuracy = np.mean(np.asarray(X) == np.asarray(pred), axis=1)

    predictions = (row_accuracy < threshold).astype(int)

    return np.sum(labels == predictions) / len(labels)


def calculate_threshold(X, pred):
    """Derive the anomaly threshold from a calibration set.

    Args:
        X: 2-D array of input samples, one sample per row.
        pred: 2-D array of reconstructions with the same shape as ``X``.

    Returns:
        The 1st percentile of the per-row reconstruction accuracies.
    """
    # Vectorised per-row reconstruction accuracy (fraction of matching
    # features); replaces a Python-level loop with identical results.
    row_accuracy = np.mean(np.asarray(X) == np.asarray(pred), axis=1)

    # Rows scoring below this value fall in the lowest 1% of the calibration set.
    return np.percentile(row_accuracy, 1)


In [50]:
# Event ids (file-name suffixes) that make up each split.
train_datasets = [52, 21]
test_datasets = [34, 7]

# Event ids whose leading rows are used to calibrate the anomaly threshold.
thresh_datasets = [83]

# Concatenate the per-event files of each split into a single array.
X_train = np.concatenate([load_train_dataset(windfarm, i) for i in train_datasets])

X_thresh = np.concatenate([load_thresh_dataset(windfarm, i) for i in thresh_datasets])

X_test = np.concatenate([load_test_dataset(windfarm, i) for i in test_datasets])
y_test = np.concatenate([load_test_labels(windfarm, i) for i in test_datasets])

# Every test row needs exactly one label; fail here rather than deep inside
# the accuracy computation if the feature and label files are out of sync.
assert len(X_test) == len(y_test), "X_test and y_test have different lengths"

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (91991, 315)
X_test shape: (114452, 315)


In [53]:
def run_evaluation(tm: TMAutoEncoder) -> float:
    """Evaluate a trained autoencoder on the notebook-level test split.

    The threshold is calibrated from the model's reconstruction of
    ``X_thresh`` and then used to score its reconstruction of ``X_test``
    against ``y_test``. Both values are printed.

    Args:
        tm: Model whose ``predict`` method is called on ``X_thresh`` and ``X_test``.

    Returns:
        The test accuracy computed by ``calculate_accuracy``.
    """
    threshold = calculate_threshold(X_thresh, tm.predict(X_thresh))

    accuracy = calculate_accuracy(
        X_test, tm.predict(X_test), y_test, threshold=threshold
    )

    print(f"Threshold: {threshold}")
    print(f"Test accuracy: {accuracy}")

    return accuracy


def objective(trial: optuna.Trial) -> float:
    """Optuna objective: train a ``TMAutoEncoder`` and return its test accuracy.

    Hyperparameters are sampled from ``trial``, a model is built with them,
    ``tm.fit`` is called on ``X_train`` for a fixed number of epochs, and the
    model is scored with ``run_evaluation``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The accuracy returned by ``run_evaluation`` for the trained model.
    """
    n_epochs = 15

    number_of_features = X_train.shape[1]
    # Every feature index is listed as an active output.
    output_active = np.arange(number_of_features, dtype=np.uint32)

    number_of_clauses = trial.suggest_int("number_of_clauses", 50, 2000)
    T = trial.suggest_int("T", 50, 50000)
    s = trial.suggest_int("s", 1, 100)
    max_included_literals = trial.suggest_int("max_included_literals", 1, 3 * number_of_features)
    accumulation = trial.suggest_int("accumulation", 1, 10)
    feature_negation = trial.suggest_categorical("feature_negation", [True, False])
    output_balancing = trial.suggest_float("output_balancing", 0, 10)
    number_of_examples = trial.suggest_int("number_of_examples", 10, 1000)

    tm = TMAutoEncoder(
        number_of_clauses=number_of_clauses,
        T=T,
        s=s,
        output_active=output_active,
        max_included_literals=max_included_literals,
        accumulation=accumulation,
        feature_negation=feature_negation,
        # Use the notebook-level backend setting instead of a hard-coded
        # literal, so switching to CUDA is a one-line config change.
        platform=platform,
        output_balancing=output_balancing,
    )

    for e in range(n_epochs):
        tm.fit(X_train, number_of_examples=number_of_examples)
        print("Finished epoch", e)

    # Reported only once all epochs have run (it used to be printed before
    # training had started).
    print(f"Finished training for trial {trial.number}")

    return run_evaluation(tm)


In [None]:
# Run the hyperparameter search, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

best_params = study.best_params
print(f"Best params: {best_params}")

# Persist the best parameters, one "name: value" pair per line.
with open("best_params.txt", "w") as f:
    f.writelines(f"{key}: {value}\n" for key, value in best_params.items())

Finished training for trial 0


In [35]:
# Evaluate a previously pickled model. run_evaluation already prints the
# threshold and the test accuracy, so printing the result again here would
# only duplicate that line in the cell output.
model = load_model("latest.pkl")
test_acc = run_evaluation(model)

Threshold: 0.8126984126984127
Test accuracy: 0.8457694055149757
Test accuracy: 0.8457694055149757
