In [18]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install numpy==1.26.4

In [19]:
import numpy as np
import pandas as pd
import pickle
import optuna
import os

from tmu.models.autoencoder.autoencoder import TMAutoEncoder

In [20]:
# Bit width of one encoded value: huber_loss regroups the binary feature vectors
# into integers of this many bits before comparing them.
bits = 8

In [21]:
# Make sure the "models" output folder exists (save_model writes its pickles
# under ./models/); exist_ok=True keeps this cell safe to re-run.
os.makedirs("models", exist_ok=True)

In [22]:
def load_train_dataset(farm, event_id):
    """Load the training feature matrix of one event.

    Reads ``./data_train/X_{farm}_{event_id}.txt`` with ``np.loadtxt``.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event identifier used in the file name.

    Returns:
        np.ndarray of dtype ``np.uint32`` with the contents of the file.
    """
    # loadtxt already hands back a freshly allocated uint32 array, so the
    # former np.array(...).astype(np.uint32) round trip only added two
    # full-size copies without changing the result.
    return np.loadtxt(f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32)


def load_test_dataset(farm, event_id, max_rows=5000):
    """Load the leading rows of the test feature matrix of one event.

    Reads ``./data_test/X_{farm}_{event_id}.txt`` with ``np.loadtxt`` and keeps
    only the first ``max_rows`` rows (a prefix of the file, not a random
    sample).

    Args:
        farm: Farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event identifier used in the file name.
        max_rows: Number of leading rows to keep. Defaults to 5000, the value
            that used to be hard-coded; pass ``None`` to keep every row.

    Returns:
        np.ndarray of dtype ``np.uint32`` with at most ``max_rows`` rows.
    """
    # loadtxt already returns a uint32 array; no extra copies are required.
    X = np.loadtxt(f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32)

    # X[:None] is the whole array, so max_rows=None disables the truncation.
    return X[:max_rows]


def load_test_labels(farm, event_id):
    """Load the label, status and split columns of one test event.

    Reads ``./data_test/y_{farm}_{event_id}.csv`` and keeps the LAST 10000 rows
    of the ``label``, ``status_type_id`` and ``train_test`` columns.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event identifier used in the file name.

    Returns:
        Tuple ``(labels, status_ids, train_test)``: the first two are
        ``np.uint32`` arrays, the third holds the raw ``train_test`` column
        values without any conversion.
    """
    frame = pd.read_csv(f"./data_test/y_{farm}_{event_id}.csv")

    # Pull each column out and trim it to its trailing 10000 entries.
    # NOTE(review): load_test_dataset keeps the FIRST 5000 feature rows, so
    # these labels are not row-aligned with its output - confirm intent.
    labels, status_ids, train_test = (
        frame[column].values[-10000:]
        for column in ('label', 'status_type_id', 'train_test')
    )

    return np.asarray(labels).astype(np.uint32), np.asarray(status_ids).astype(np.uint32), train_test


def load_thresh_dataset(farm, event_id, max_rows=5000):
    """Load the leading rows of one event's training feature matrix.

    Reads ``./data_train/X_{farm}_{event_id}.txt`` (the same path pattern
    load_train_dataset uses) and keeps only the first ``max_rows`` rows.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"B"``).
        event_id: Event identifier used in the file name.
        max_rows: Number of leading rows to keep. Defaults to 5000, the value
            that used to be hard-coded; pass ``None`` to keep every row.

    Returns:
        np.ndarray of dtype ``np.uint32`` with at most ``max_rows`` rows.
    """
    # loadtxt already returns a uint32 array; no extra copies are required.
    X = np.loadtxt(f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32)

    # X[:None] is the whole array, so max_rows=None disables the truncation.
    return X[:max_rows]


In [23]:
def save_model(tm: TMAutoEncoder, filename: str):
    """Pickle a model to ``./models/{filename}`` without its training data.

    ``tm.X_train`` and ``tm.encoded_X_train`` are set to ``None`` while the
    model is pickled, so the file does not contain them, and are put back on
    the live object afterwards.

    Args:
        tm: Model to serialize; its ``X_train`` and ``encoded_X_train``
            attributes are read and temporarily replaced.
        filename: File name (inside ``./models/``) to write the pickle to.

    Raises:
        OSError: If the target file cannot be opened for writing.
        pickle.PicklingError: If the model cannot be pickled.
    """
    saved_inputs, saved_encoded = tm.X_train, tm.encoded_X_train

    tm.X_train = None
    tm.encoded_X_train = None
    try:
        with open(f"./models/{filename}", "wb") as f:
            pickle.dump(tm, f)
    finally:
        # Restore even when opening or pickling fails; otherwise the caller's
        # model would be left without its training data.
        tm.X_train = saved_inputs
        tm.encoded_X_train = saved_encoded

In [24]:
# Event ids of farm "B" whose training files are stacked into X_train.
# Further ids, currently not enabled: 53, 27, 19, 77, 83, 52, 21, 2, 23, 87, 74, 86, 82
train_datasets = [34, 7]

# Stack the feature matrices of all selected events along the first axis
X_train = np.concatenate(
    [load_train_dataset("B", event_id) for event_id in train_datasets]
)

# Evaluation features and label columns come from farm "B", event 52
X_test = load_test_dataset("B", 52)
y_test, status_ids, train_test = load_test_labels("B", 52)
# NOTE(review): the recorded output shows X_test with 5000 rows but y_test
# with 10000 entries - confirm the two are aligned before comparing them.

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (93206, 504)
X_test shape: (5000, 504)
y_test shape: (10000,)


In [25]:
def binary_to_decimal(arr, bit_length):
    """Decode a flat sequence of 0/1 digits into integers.

    Consecutive groups of ``bit_length`` entries are each read as one base-2
    number, first entry being the most significant bit. A trailing group that
    is shorter than ``bit_length`` is decoded from the digits it has.

    Args:
        arr: Sequence of binary digits (each entry must print as "0" or "1").
        bit_length: Number of digits per decoded integer.

    Returns:
        List of Python ints, one per group.
    """
    decoded = []
    for start in range(0, len(arr), bit_length):
        # Spell the group out as a bit string and let int() parse it
        digits = "".join(str(bit) for bit in arr[start:start + bit_length])
        decoded.append(int(digits, 2))
    return decoded


def huber_loss(pred, X_test, delta=1.0):
    """Mean Huber loss between two bit vectors, compared as decoded integers.

    Both inputs are first decoded with binary_to_decimal using the
    module-level ``bits`` as group width; the loss is then taken over the
    element-wise differences of the decoded values.

    Args:
        pred: Flat sequence of binary digits.
        X_test: Flat sequence of binary digits to compare ``pred`` against.
        delta: Absolute differences below this use the quadratic branch,
            all others the linear branch.

    Returns:
        Mean of the per-value Huber losses.
    """
    decoded_pred = np.array(binary_to_decimal(pred, bits))
    decoded_ref = np.array(binary_to_decimal(X_test, bits))

    # Compute the difference once and reuse it in both branches
    residual = decoded_pred - decoded_ref
    magnitude = np.abs(residual)

    quadratic = 0.5 * (residual ** 2)
    linear = delta * (magnitude - 0.5 * delta)

    return np.mean(np.where(magnitude < delta, quadratic, linear))

In [26]:
def run_evaluation(tm: TMAutoEncoder) -> float:
    """Score a model on the module-level ``X_test`` with the Huber loss.

    Calls ``tm.predict(X_test)``, computes huber_loss for every row of the
    prediction against the matching row of ``X_test``, prints the mean, median
    and maximum of those per-row losses and returns the mean.

    Args:
        tm: Model whose ``predict`` output is compared with ``X_test``.

    Returns:
        Mean of the per-row losses.
    """
    pred = tm.predict(X_test)

    # huber_loss is declared as (pred, X_test): pass the prediction first so
    # the call keeps matching the signature if the loss ever becomes asymmetric.
    loss = [huber_loss(pred[i], X_test[i]) for i in range(len(X_test))]

    print(f"Mean loss: {np.mean(loss)}, Median loss: {np.median(loss)}, Max loss: {np.max(loss)}")

    return np.mean(loss)


def objective(trial: optuna.Trial) -> float:
    """Optuna objective: train one TMAutoEncoder and return its evaluation loss.

    Samples ``number_of_clauses``, ``T`` and ``max_included_literals`` from the
    trial, fits the model on the module-level ``X_train`` for five rounds of
    100 examples each, records the trial number in ``temp_params.txt`` and
    returns the value of run_evaluation.

    Args:
        trial: Optuna trial the hyperparameters are sampled from.

    Returns:
        The loss reported by run_evaluation for the trained model.
    """
    feature_count = X_train.shape[1]

    # Hyperparameters under search
    clauses = trial.suggest_int("number_of_clauses", 20, 2000)
    threshold = trial.suggest_int("T", 20, 10000)
    literal_budget = trial.suggest_int("max_included_literals", 1, 3 * feature_count)

    tm = TMAutoEncoder(
        number_of_clauses=clauses,
        T=threshold,
        s=25.0,
        # Every feature index is marked as an active output
        output_active=np.arange(feature_count, dtype=np.uint32),
        max_included_literals=literal_budget,
        accumulation=1,
        feature_negation=False,
        platform="CPU",  # TODO: Change to CUDA
        output_balancing=0,
    )

    print(f"Starting training for trial {trial.number}")

    for _ in range(5):
        tm.fit(X_train, number_of_examples=100)

    # Record which trial just finished training in "temp_params.txt"
    # (only the trial number is written, not the sampled parameters)
    with open("temp_params.txt", "w") as f:
        f.write(f"Trial: {trial.number}\n")

    return run_evaluation(tm)


In [27]:
# Run the hyperparameter search; the value returned by objective is minimized
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)

# Fetch the winning hyperparameters, show them, then persist them
best_params = study.best_params

print(f"Best params: {best_params}")

# One "name: value" line per hyperparameter
with open("best_params.txt", "w") as f:
    for key in best_params:
        value = best_params[key]
        f.write(f"{key}: {value}\n")

Starting training for trial 0
Mean loss: 17.39273015873016, Median loss: 13.96031746031746, Max loss: 162.38888888888889
Best params: {'number_of_clauses': 930, 'T': 4185, 'max_included_literals': 115}


In [28]:
# Re-read the best hyperparameters from the study and display them; this
# repeats the lookup already done right after study.optimize(...).
best_params = study.best_params

print(best_params)

{'number_of_clauses': 930, 'T': 4185, 'max_included_literals': 115}
