In [171]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install numpy==1.26.4

In [172]:
import numpy as np
import pickle
import os

from tmu.models.autoencoder.autoencoder import TMAutoEncoder

In [173]:
# Number of bits that encode one value; used as the default `bits_per_value`
# of mse_loss.
bits = 10

In [174]:
# Create the ./models folder if it does not exist; save_model writes its
# pickle files there.
os.makedirs("models", exist_ok=True)

In [175]:
def load_train_dataset(farm, event_id):
    """Load the training feature matrix of one farm/event pair.

    Reads ``./data_train/X_{farm}_{event_id}.txt`` with ``np.loadtxt``.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"A"``).
        event_id: Event identifier used in the file name.

    Returns:
        A 2-D ``np.uint32`` array with one row per data line of the file.
    """
    # loadtxt already returns uint32, so the extra np.array(...).astype(...)
    # copies are unnecessary. ndmin=2 keeps a single-row file 2-D so it can
    # still be concatenated with the arrays of other events.
    return np.loadtxt(
        f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2
    )


def load_test_dataset(farm, event_id):
    """Load the test feature matrix of one farm/event pair.

    Reads ``./data_test/X_{farm}_{event_id}.txt`` with ``np.loadtxt``.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"A"``).
        event_id: Event identifier used in the file name.

    Returns:
        A 2-D ``np.uint32`` array with one row per data line of the file.
    """
    # loadtxt already returns uint32, so the extra np.array(...).astype(...)
    # copies are unnecessary. ndmin=2 keeps a single-row file 2-D so it can
    # still be concatenated with the arrays of other events.
    return np.loadtxt(
        f"./data_test/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2
    )


def load_test_labels(farm, event_id):
    """Load the test labels of one farm/event pair.

    Reads ``./data_test/y_{farm}_{event_id}.txt`` with ``np.loadtxt``.

    Args:
        farm: Farm identifier used in the file name (e.g. ``"A"``).
        event_id: Event identifier used in the file name.

    Returns:
        An ``np.uint32`` array with at least one dimension; one entry per
        line for a file holding a single label per line.
    """
    # loadtxt already returns uint32, so the extra np.array(...).astype(...)
    # copies are unnecessary. ndmin=1 stops a one-label file from collapsing
    # to a 0-d array, which np.concatenate would reject.
    return np.loadtxt(
        f"./data_test/y_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=1
    )


In [176]:
def save_model(tm: TMAutoEncoder, filename: str):
    """Pickle ``tm`` to ``./models/{filename}`` without its training arrays.

    ``tm.X_train`` and ``tm.encoded_X_train`` are set to ``None`` while the
    object is pickled, so they are not written to the file, and are put back
    afterwards so the in-memory model is left unchanged.

    Args:
        tm: Model to serialize.
        filename: Name of the target file, relative to ``./models``.

    Raises:
        Any exception raised by ``open`` or ``pickle.dump`` is propagated
        after the two attributes have been restored.
    """
    saved_X_train, saved_encoded = tm.X_train, tm.encoded_X_train

    tm.X_train = None
    tm.encoded_X_train = None
    try:
        with open(f"./models/{filename}", "wb") as f:
            pickle.dump(tm, f)
    finally:
        # Restore even when opening or pickling fails; otherwise the caller
        # would keep a model whose training arrays were silently dropped.
        tm.X_train = saved_X_train
        tm.encoded_X_train = saved_encoded

In [177]:
# Event ids of farm "A": the first list feeds X_train, the second feeds
# X_test / y_test.
train_datasets = [25, 69, 13, 24, 3, 17, 38, 71, 14, 92, 51]
test_dataset = [68, 22]

# Stack the per-event arrays along the first axis so each split is one array.
X_train = np.concatenate(
    tuple(load_train_dataset("A", event) for event in train_datasets)
)
X_test = np.concatenate(
    tuple(load_test_dataset("A", event) for event in test_dataset)
)
y_test = np.concatenate(
    tuple(load_test_labels("A", event) for event in test_dataset)
)

# Report the resulting shapes, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (430495, 540)
X_test shape: (3442, 540)
y_test shape: (3442,)


In [178]:
def binary_to_float(bin_array):
    """Decode bit vectors (most significant bit first) into floats in [0, 1].

    The bits run along the last axis and their number is taken from the
    input, so any width works (this notebook uses 10). The integer value of
    each bit vector is divided by ``2**n_bits - 1`` so all-ones maps to 1.0.

    Args:
        bin_array: Array-like of 0/1 values. A 1-D input is one value; an
            N-D input is decoded along its last axis.

    Returns:
        A NumPy float scalar for 1-D input, otherwise a float array with the
        last axis removed.
    """
    bit_values = np.asarray(bin_array)
    n_bits = bit_values.shape[-1]
    # Place values 2**(n_bits - 1), ..., 2, 1.
    place_values = 2 ** np.arange(n_bits)[::-1]
    # np.dot contracts the last axis of an N-D array with a 1-D vector.
    return np.dot(bit_values, place_values) / (2 ** n_bits - 1)


def mse_loss(X, pred, bits_per_value=bits):
    """
    Compute the MSE between two flattened bit-encoded samples.

    - X and pred are 1D arrays of length `num_values * bits_per_value`.
    - Each consecutive group of `bits_per_value` bits (most significant bit
      first) is decoded to a float in [0, 1] by dividing its integer value by
      `2 ** bits_per_value - 1`, the same decoding binary_to_float applies to
      a single group.
    - Returns the mean of the squared differences of the decoded values.

    Raises ValueError (from reshape) when the input length is not a multiple
    of `bits_per_value`.
    """
    num_values = len(X) // bits_per_value

    # Place values of one group and the largest integer a group can encode.
    place_values = 2 ** np.arange(bits_per_value)[::-1]
    max_value = 2 ** bits_per_value - 1

    # Decode all groups with one matrix-vector product instead of one
    # Python-level call per value; the integer products are exact, so the
    # decoded floats match the per-row computation.
    X_floats = np.dot(X.reshape(num_values, bits_per_value), place_values) / max_value
    pred_floats = np.dot(pred.reshape(num_values, bits_per_value), place_values) / max_value

    return np.mean((X_floats - pred_floats) ** 2)

In [179]:
def reconstruction_accuracy(X, pred):
    """Fraction of elements of ``pred`` that equal the matching element of ``X``.

    The fraction is taken over all elements, so the result stays in [0, 1]
    for 2-D batches as well as for a single 1-D sample (dividing the match
    count by ``len(X)`` would use the row count for 2-D input).

    Args:
        X: Array-like of reference values.
        pred: Array-like of reconstructed values with the same shape as ``X``.

    Returns:
        The share of matching elements as a NumPy float.
    """
    matches = np.asarray(X) == np.asarray(pred)
    return np.mean(matches)


def test(tm, X, y):
    """Median reconstruction loss of ``tm`` on the label-0 and label-1 rows.

    Args:
        tm: Model exposing ``predict``; it is called once per label group.
        X: Samples, indexed with boolean masks derived from ``y``.
        y: Labels aligned with ``X``; rows labelled 0 and 1 are evaluated
            separately.

    Returns:
        ``(median_loss_0, median_loss_1, count_0, count_1)``, where each loss
        is the median over the group of ``mse_loss(row, prediction)``.
    """
    # Split first and predict both groups before scoring either of them.
    rows_by_label = [X[y == label] for label in (0, 1)]
    preds_by_label = [tm.predict(rows) for rows in rows_by_label]

    medians = []
    for rows, preds in zip(rows_by_label, preds_by_label):
        losses = [mse_loss(row, preds[idx]) for idx, row in enumerate(rows)]
        medians.append(np.median(losses))

    rows_0, rows_1 = rows_by_label
    return medians[0], medians[1], len(rows_0), len(rows_1)

In [180]:
def train(args):
    """Train a TMAutoEncoder on the global ``X_train``, evaluating every epoch.

    After each epoch the model is evaluated with ``test`` on the global
    ``X_test`` / ``y_test``, the returned medians and counts are printed, and
    the model is written with ``save_model`` as ``latest_{e}.pkl`` where ``e``
    is the zero-based epoch index.

    Args:
        args: Dict of settings. ``num_clauses``, ``T``, ``s``,
            ``output_active``, ``max_included_literals``, ``accumulation``,
            ``feature_negation``, ``platform`` and ``output_balancing`` are
            passed to the ``TMAutoEncoder`` constructor;
            ``number_of_examples`` is passed to ``fit``; ``epochs`` is the
            number of fit/evaluate/save rounds. Other keys are not read.

    Returns:
        The trained ``TMAutoEncoder``, so that ``result = train(args)`` gives
        the caller the model instead of ``None``.
    """
    tm = TMAutoEncoder(
        number_of_clauses=args["num_clauses"],
        T=args["T"],
        s=args["s"],
        output_active=args["output_active"],
        max_included_literals=args["max_included_literals"],
        accumulation=args["accumulation"],
        feature_negation=args["feature_negation"],
        platform=args["platform"],
        output_balancing=args["output_balancing"],
    )

    print(f"Starting training for {args['epochs']} epochs")

    for e in range(args["epochs"]):
        tm.fit(X_train, number_of_examples=args["number_of_examples"])

        # test() returns median per-row losses for label 0 and label 1 plus
        # the number of rows in each group.
        nra, ara, nc, ac = test(tm, X_test, y_test)
        print(f"Epoch: {e + 1} MAccN: {nra}, MAccA: {ara}, NCount: {nc}, ACount: {ac}")

        # One checkpoint per epoch, named by the zero-based epoch index.
        save_model(tm, f"latest_{e}.pkl")

    return tm


In [181]:
# Feature count of the training matrix and the index vector 0..n-1 that is
# handed to the model as `output_active`.
number_of_features = X_train.shape[1]
output_active = np.arange(number_of_features, dtype=np.uint32)

number_of_clauses = 500

# Settings read by train(); "clause_weight_threshold" is carried along but
# train() does not read it.
args: dict = dict(
    clause_weight_threshold=1,
    number_of_examples=100,
    output_active=output_active,
    accumulation=1,
    num_clauses=number_of_clauses,
    T=int(number_of_clauses * 75),
    s=25,
    epochs=25,
    platform="CPU",
    output_balancing=0,
    max_included_literals=number_of_features,
    feature_negation=True,
)

# Run training with the configuration above.
result = train(args)

Starting training for 25 epochs
Epoch: 1 MAccN: 0.05463786585093304, MAccA: 0.06367884622333377, NCount: 424, ACount: 3018
Epoch: 2 MAccN: 0.03140254682471859, MAccA: 0.04384847787658412, NCount: 424, ACount: 3018
Epoch: 3 MAccN: 0.02393907401054838, MAccA: 0.03642326204051679, NCount: 424, ACount: 3018
Epoch: 4 MAccN: 0.019103910447103037, MAccA: 0.034731408232285904, NCount: 424, ACount: 3018
Epoch: 5 MAccN: 0.018708954040416434, MAccA: 0.03352468546552992, NCount: 424, ACount: 3018
Epoch: 6 MAccN: 0.017157113340066706, MAccA: 0.03265889572241332, NCount: 424, ACount: 3018
Epoch: 7 MAccN: 0.014604371353443762, MAccA: 0.032244474618264546, NCount: 424, ACount: 3018
Epoch: 8 MAccN: 0.015876371637416006, MAccA: 0.03064529931272277, NCount: 424, ACount: 3018
Epoch: 9 MAccN: 0.015955053253111882, MAccA: 0.030673974351120417, NCount: 424, ACount: 3018
Epoch: 10 MAccN: 0.01681602990740148, MAccA: 0.029521664261360915, NCount: 424, ACount: 3018
Epoch: 11 MAccN: 0.015664861156720437, MAccA: 0