In [33]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install numpy==1.26.4

In [34]:
import numpy as np
import pickle
import optuna
import os

from tmu.models.autoencoder.autoencoder import TMAutoEncoder

In [35]:
# Number of bits used to encode one value in the binarised data.
# It is used as the default `bits_per_value` of mse_loss / mae_loss; because a
# default argument is bound when the `def` runs, changing this value only takes
# effect after the cell defining those functions is re-run.
bits = 10

In [36]:
# Make sure the ./models output directory exists (save_model writes its pickles
# there). exist_ok=True makes re-running this cell harmless.
os.makedirs("models", exist_ok=True)

In [37]:
def load_train_dataset(farm, event_id):
    """
    Load the training feature matrix of one event from
    ./data_train/X_{farm}_{event_id}.txt.

    Parameters:
    - farm (str): Farm identifier used in the file name (e.g. "A").
    - event_id (int): Event identifier used in the file name.

    Returns:
    - numpy array: uint32 array of shape (n_rows, n_features).
    """
    # `dtype=np.uint32` already produces the final dtype, so no extra
    # np.array(...).astype(...) copies are needed.
    # `ndmin=2` keeps a file with a single row two-dimensional, so the result
    # can always be concatenated row-wise with the arrays of other events.
    return np.loadtxt(f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2)


def load_test_dataset(farm, event_id, max_rows=5000):
    """
    Load the leading rows of the test feature matrix of one event from
    ./data_test/X_{farm}_{event_id}.txt.

    Parameters:
    - farm (str): Farm identifier used in the file name (e.g. "A").
    - event_id (int): Event identifier used in the file name.
    - max_rows (int or None): Maximum number of leading rows to keep
      (default 5000). None keeps every row.

    Returns:
    - numpy array: uint32 array of shape (min(n_rows, max_rows), n_features).
    """
    # `dtype=np.uint32` already produces the final dtype, so no extra copies
    # are needed. `ndmin=2` keeps a single-row file two-dimensional; without
    # it the slice below would cut columns instead of rows.
    X = np.loadtxt(f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2)

    # Keep the FIRST max_rows rows (a deterministic head, not a random sample).
    return X[:max_rows]


def load_test_labels(farm, event_id, max_rows=5000):
    """
    Load the leading test labels of one event from
    ./data_test/y_{farm}_{event_id}.txt.

    Parameters:
    - farm (str): Farm identifier used in the file name (e.g. "A").
    - event_id (int): Event identifier used in the file name.
    - max_rows (int or None): Maximum number of leading labels to keep
      (default 5000, matching load_test_dataset). None keeps every label.

    Returns:
    - numpy array: uint32 array of shape (min(n_labels, max_rows),).
    """
    # `dtype=np.uint32` already produces the final dtype, so no extra copies
    # are needed. `ndmin=1` keeps a file holding a single label
    # one-dimensional; a 0-d array could not be sliced below.
    y = np.loadtxt(f"./data_test/y_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=1)

    # Keep the FIRST max_rows labels (a deterministic head, not a random sample).
    return y[:max_rows]


def load_thresh_dataset(farm, event_id, max_rows=5000):
    """
    Load the leading rows of one event's TRAINING feature matrix from
    ./data_train/X_{farm}_{event_id}.txt.

    NOTE(review): judging by the name, this subset is presumably used to
    calibrate a threshold; no caller is visible in this part of the notebook.

    Parameters:
    - farm (str): Farm identifier used in the file name (e.g. "A").
    - event_id (int): Event identifier used in the file name.
    - max_rows (int or None): Maximum number of leading rows to keep
      (default 5000). None keeps every row.

    Returns:
    - numpy array: uint32 array of shape (min(n_rows, max_rows), n_features).
    """
    # `dtype=np.uint32` already produces the final dtype, so no extra copies
    # are needed. `ndmin=2` keeps a single-row file two-dimensional; without
    # it the slice below would cut columns instead of rows.
    X = np.loadtxt(f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2)

    # Keep the first max_rows rows.
    return X[:max_rows]


In [38]:
def save_model(tm: TMAutoEncoder, filename: str):
    """
    Pickle `tm` to ./models/{filename} without its training-data attributes.

    `tm.X_train` and `tm.encoded_X_train` are set to None while the model is
    dumped, so the pickle stores None for both, and are put back on the live
    object afterwards.

    Parameters:
    - tm (TMAutoEncoder): Model to serialise.
    - filename (str): File name inside the ./models directory.

    Raises:
    - Whatever `open` or `pickle.dump` raises (e.g. OSError if ./models is
      missing). The model's attributes are restored in that case too.
    """
    saved_X_train, saved_encoded_X_train = tm.X_train, tm.encoded_X_train

    tm.X_train = None
    tm.encoded_X_train = None

    try:
        with open(f"./models/{filename}", "wb") as f:
            pickle.dump(tm, f)
    finally:
        # Restore even when opening or dumping fails; otherwise the in-memory
        # model would silently be left without these attributes.
        tm.X_train = saved_X_train
        tm.encoded_X_train = saved_encoded_X_train

In [39]:
# Build the train / test arrays from the per-event files of farm "A".
# NOTE(review): the original TODO here ("Load X_train from a dataset") looks
# stale, since the code below already loads X_train from the data files.

# Event ids to load; the commented-out tail lists further ids that are currently left out.
train_datasets = [25, 69, 13]  #, 24, 3, 17, 38, 71, 14, 92, 51]
test_dataset = [51]

# Stack the arrays of all selected events row-wise into one array
X_train = np.concatenate([load_train_dataset("A", i) for i in train_datasets])

# X_test and y_test are read for the same event ids, so they are meant to line up row by row.
X_test = np.concatenate([load_test_dataset("A", i) for i in test_dataset])
y_test = np.concatenate([load_test_labels("A", i) for i in test_dataset])

# Quick sanity check of the loaded shapes.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (119797, 432)
X_test shape: (2393, 432)
y_test shape: (2393,)


In [40]:
def binary_to_float(bin_array):
    """
    Convert MSB-first binary digits to a float between 0 and 1.

    The bits along the last axis are read as an unsigned integer (first bit is
    the most significant) and divided by the largest value representable with
    that many bits, so all-zeros maps to 0.0 and all-ones maps to 1.0.

    Parameters:
    - bin_array (array-like): 0/1 values of shape (n_bits,) for a single value,
      or (..., n_bits) to decode many values at once.

    Returns:
    - float or numpy array: A scalar for 1-D input, otherwise an array with the
      last axis removed.
    """
    bin_array = np.asarray(bin_array)

    # The bit width is the length of the LAST axis, so that a batch of rows is
    # decoded row by row rather than being mistaken for one long bit string.
    n_bits = bin_array.shape[-1]
    weights = 2 ** np.arange(n_bits)[::-1]

    return np.dot(bin_array, weights) / (2 ** n_bits - 1)


def _decode_flat_bits(flat_bits, num_values, bits_per_value):
    """Reshape a flat bit array to (num_values, bits_per_value) and decode each row to a float."""
    rows = flat_bits.reshape(num_values, bits_per_value)
    return np.array([binary_to_float(row) for row in rows])


def _decoded_error(X, pred, bits_per_value):
    """Return decoded(X) - decoded(pred), one entry per encoded value; shared by mse_loss and mae_loss."""
    # The number of encoded values is derived from X; pred must hold the same
    # number of bits, otherwise its reshape raises ValueError.
    num_values = int(len(X) // bits_per_value)

    X_floats = _decode_flat_bits(X, num_values, bits_per_value)
    pred_floats = _decode_flat_bits(pred, num_values, bits_per_value)
    return X_floats - pred_floats


def mse_loss(X, pred, bits_per_value=bits):
    """
    Compute MSE loss for flattened binary inputs.

    Parameters:
    - X (numpy array): 1D array of length `num_values * bits_per_value`.
    - pred (numpy array): 1D array of the same length as X.
    - bits_per_value (int): Number of consecutive bits that encode one value.

    Returns:
    - float: Mean squared difference between the decoded values of X and pred.

    Raises:
    - ValueError: If an input cannot be reshaped to (num_values, bits_per_value).
    """
    return np.mean(_decoded_error(X, pred, bits_per_value) ** 2)


def mae_loss(X, pred, bits_per_value=bits):
    """
    Compute MAE loss for flattened binary inputs.

    Parameters:
    - X (numpy array): 1D array of length `num_values * bits_per_value`.
    - pred (numpy array): 1D array of the same length as X.
    - bits_per_value (int): Number of consecutive bits that encode one value.

    Returns:
    - float: Mean absolute difference between the decoded values of X and pred.

    Raises:
    - ValueError: If an input cannot be reshaped to (num_values, bits_per_value).
    """
    return np.mean(np.abs(_decoded_error(X, pred, bits_per_value)))


def hamming_loss(pred, X_test):
    """
    Fraction of positions at which two equally shaped arrays disagree.

    Parameters:
    - pred (numpy array): Predicted binary values.
    - X_test (numpy array): Reference binary values, same shape as `pred`.

    Returns:
    - float: Number of differing entries divided by the total number of entries.
    """
    assert pred.shape == X_test.shape, "Shapes of pred and X_test must match"

    # True wherever the two arrays disagree
    mismatch_mask = pred != X_test

    # Mismatches over total entry count
    return np.sum(mismatch_mask) / np.prod(X_test.shape)

In [41]:
def run_evaluation(tm: TMAutoEncoder) -> float:
    """
    Score how well `tm` reproduces the global X_test.

    Predicts X_test with the model, computes the Hamming loss of every row
    against the corresponding input row, prints the mean / median / max of
    those per-row losses and returns the mean.

    Parameters:
    - tm (TMAutoEncoder): Trained model to evaluate.

    Returns:
    - float: Mean per-row Hamming loss over all rows of X_test.
    """
    pred = tm.predict(X_test)

    # One loss value per test row
    loss = [hamming_loss(row, pred[idx]) for idx, row in enumerate(X_test)]
    mean_loss = np.mean(loss)

    print(f"Mean loss: {mean_loss}, Median loss: {np.median(loss)}, Max loss: {np.max(loss)}")

    # Mean over all test rows
    return mean_loss


def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: train a TMAutoEncoder with suggested hyperparameters and
    return its mean reconstruction Hamming loss on X_test (lower is better).

    Reads the globals X_train (training data) and, through run_evaluation,
    X_test.

    Parameters:
    - trial (optuna.Trial): Trial used to suggest the hyperparameters.

    Returns:
    - float: Value returned by run_evaluation for the trained model.
    """
    n_features = X_train.shape[1]

    # Constructor hyperparameters, suggested in a fixed order.
    # NOTE(review): the upper bound of max_included_literals is 3x the feature
    # count; confirm that this range is intended.
    hyperparams = {
        "number_of_clauses": trial.suggest_int("number_of_clauses", 20, 2000),
        "T": trial.suggest_int("T", 20, 10000),
        "s": trial.suggest_int("s", 1, 100),
        "max_included_literals": trial.suggest_int("max_included_literals", 1, 3 * n_features),
        "accumulation": trial.suggest_int("accumulation", 1, 10),
        "feature_negation": trial.suggest_categorical("feature_negation", [True, False]),
        "output_balancing": trial.suggest_float("output_balancing", 0, 10),
    }

    # The range is 100..100, so this always yields 100; going through the trial
    # still records the value among the trial's parameters.
    examples_per_fit = trial.suggest_int("number_of_examples", 100, 100)

    tm = TMAutoEncoder(
        # Every input feature is also an output of the autoencoder.
        output_active=np.arange(n_features, dtype=np.uint32),
        platform="CPU",  # TODO: Change to CUDA
        **hyperparams,
    )

    print(f"Starting training for trial {trial.number}")

    # Five consecutive fit calls on the full training array
    for _ in range(5):
        tm.fit(X_train, number_of_examples=examples_per_fit)

    return run_evaluation(tm)


In [42]:
# Hyperparameter search: minimise the value returned by `objective`.
study = optuna.create_study(direction="minimize")

try:
    study.optimize(objective, n_trials=150)
finally:
    # Runs after a normal finish AND when the search is cut short (e.g. the
    # cell is interrupted), so the results of the trials that did finish are
    # not lost. The interrupt/exception itself still propagates afterwards.
    completed_trials = study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )

    if completed_trials:
        # Save the best params to file
        best_params = study.best_params

        print(f"Best params: {best_params}")

        with open("best_params.txt", "w") as f:
            for key, value in best_params.items():
                f.write(f"{key}: {value}\n")
    else:
        # Without a completed trial there is no best trial to report.
        print("No completed trials; best_params.txt was not written.")

Starting training for trial 0
Mean loss: 0.42658564331151044, Median loss: 0.4305555555555556, Max loss: 0.5393518518518519
Starting training for trial 1
Mean loss: 0.42662240175821453, Median loss: 0.4305555555555556, Max loss: 0.5416666666666666
Starting training for trial 2
Mean loss: 0.6043001578678553, Median loss: 0.6134259259259259, Max loss: 0.7060185185185185
Starting training for trial 3
Mean loss: 0.42658564331151044, Median loss: 0.4305555555555556, Max loss: 0.5393518518518519
Starting training for trial 4
Mean loss: 0.4278857315317825, Median loss: 0.4305555555555556, Max loss: 0.6527777777777778
Starting training for trial 5
Mean loss: 0.5750404342913745, Median loss: 0.5717592592592593, Max loss: 0.6851851851851852
Starting training for trial 6
Mean loss: 0.512250236027921, Median loss: 0.5486111111111112, Max loss: 0.6712962962962963
Starting training for trial 7
Mean loss: 0.42658564331151044, Median loss: 0.4305555555555556, Max loss: 0.5393518518518519
Starting trai

KeyboardInterrupt: 

In [44]:
# Read the best hyperparameters recorded by the study so far.
best_params = study.best_params

# Show them as a plain dict.
print(best_params)

{'number_of_clauses': 1037, 'T': 9242, 's': 99, 'max_included_literals': 555, 'accumulation': 1, 'feature_negation': False, 'output_balancing': 0.6877586247267895, 'number_of_examples': 100}
