In [36]:
#%pip install git+https://github.com/cair/tmu.git
#%pip install numpy==1.26.4

In [37]:
import numpy as np
import pickle
import os

from tmu.models.autoencoder.autoencoder import TMAutoEncoder

In [38]:
# Ensure the "models" output directory is present; this is a no-op when it already exists.
os.makedirs(name="models", exist_ok=True)

In [39]:
def load_train_dataset(farm, event_id):
    """
    Load one training matrix from ``./data_train/X_{farm}_{event_id}.txt``.

    Parameters:
    - farm: Farm identifier interpolated into the file name.
    - event_id: Event identifier interpolated into the file name.

    Returns:
    - numpy array: 2-D ``uint32`` array with one row per data line of the file.
    """
    # loadtxt already returns a freshly allocated uint32 array, so no extra
    # np.array()/astype() copies are needed. ndmin=2 keeps a single-line file
    # as shape (1, n_columns) instead of collapsing it to 1-D, so the result
    # can always be concatenated row-wise with other datasets.
    return np.loadtxt(
        f"./data_train/X_{farm}_{event_id}.txt", dtype=np.uint32, ndmin=2
    )


def load_test_dataset(farm, event_id, max_rows=1000):
    """
    Load the leading rows of ``./data_test/X_{farm}_{event_id}.txt``.

    Parameters:
    - farm: Farm identifier interpolated into the file name.
    - event_id: Event identifier interpolated into the file name.
    - max_rows (int or None): Maximum number of leading rows to read.
      Defaults to 1000; ``None`` reads the whole file.

    Returns:
    - numpy array: 2-D ``uint32`` array with at most ``max_rows`` rows.
    """
    # The row cap is handed to loadtxt so parsing stops once enough rows have
    # been read instead of loading the full file and slicing afterwards.
    # ndmin=2 keeps a single-line file as shape (1, n_columns) so results can
    # always be concatenated row-wise.
    return np.loadtxt(
        f"./data_test/X_{farm}_{event_id}.txt",
        dtype=np.uint32,
        ndmin=2,
        max_rows=max_rows,
    )


In [40]:
def save_model(tm: TMAutoEncoder, filename: str):
    """
    Pickle ``tm`` to ``./models/{filename}`` with two attributes blanked out.

    ``tm.X_train`` and ``tm.encoded_X_train`` are set to ``None`` while the
    object is dumped, so they are not written to disk, and are put back on
    ``tm`` afterwards, including when opening the file or pickling fails.

    Parameters:
    - tm (TMAutoEncoder): Model to persist.
    - filename (str): File name inside the ``models`` directory.
    """
    path = f"./models/{filename}"
    # Create the target directory on demand so saving does not depend on an
    # earlier notebook cell having been executed.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    saved_X_train, saved_encoded_X_train = tm.X_train, tm.encoded_X_train
    tm.X_train = None
    tm.encoded_X_train = None
    try:
        with open(path, "wb") as f:
            pickle.dump(tm, f)
    finally:
        # Restore unconditionally: without this, a failed dump would leave the
        # live model with both attributes stuck at None.
        tm.X_train = saved_X_train
        tm.encoded_X_train = saved_encoded_X_train

In [41]:
# Event ids passed to the loaders below (always with farm "C").
# Other training ids that were tried or are candidates:
# 7, 53, 27, 19, 77, 83, 52, 21, 2, 23, 87, 74, 86, 82
train_datasets = [81, 55]
test_dataset = [47]

# Stack the per-event matrices row-wise into one training and one test matrix.
X_train = np.concatenate(
    [load_train_dataset("C", event_id) for event_id in train_datasets],
    axis=0,
)
X_test = np.concatenate(
    [load_test_dataset("C", event_id) for event_id in test_dataset],
    axis=0,
)

print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    sep="\n",
)

X_train shape: (8470, 7600)
X_test shape: (1000, 7600)


In [42]:
def hamming_loss(pred, X_test):
    """
    Fraction of positions at which two equally shaped arrays disagree.

    Parameters:
    - pred (numpy array): Predicted binary values.
    - X_test (numpy array): Reference binary values, same shape as ``pred``.

    Returns:
    - float: Number of differing entries divided by the total number of entries.

    Raises:
    - AssertionError: If the two shapes differ.
    """
    assert pred.shape == X_test.shape, "Shapes of pred and X_test must match"

    # Element-wise mismatch mask; summing it counts the differing entries.
    mismatch_count = np.not_equal(pred, X_test).sum()

    # .size is the product of all dimensions, i.e. the total entry count.
    return mismatch_count / X_test.size

In [43]:
def test(tm, X):
    pred = tm.predict(X)

    loss = [hamming_loss(X[i], pred[i]) for i in range(len(X))]

    return np.mean(loss), np.median(loss), np.max(loss), np.min(loss)

In [44]:
def train(args):
    """
    Build a TMAutoEncoder from ``args``, fit it epoch by epoch and checkpoint it.

    Each epoch fits on the module-level ``X_train``, evaluates with ``test`` on
    the module-level ``X_test``, prints the loss summary and writes the model
    with ``save_model`` as ``latest_{e}.pkl``. Note that ``e`` in the file name
    is the zero-based epoch index, while the printed epoch number is one-based.

    Parameters:
    - args (dict): Must contain "num_clauses", "T", "s", "output_active",
      "max_included_literals", "accumulation", "feature_negation", "platform"
      and "output_balancing" (forwarded to the TMAutoEncoder constructor),
      "epochs" (number of fit/evaluate/save rounds) and "number_of_examples"
      (forwarded to ``tm.fit``).

    Returns:
    - TMAutoEncoder: The model after the last epoch, so that callers doing
      ``result = train(args)`` receive the trained model instead of ``None``.
    """
    tm = TMAutoEncoder(
        number_of_clauses=args["num_clauses"],
        T=args["T"],
        s=args["s"],
        output_active=args["output_active"],
        max_included_literals=args["max_included_literals"],
        accumulation=args["accumulation"],
        feature_negation=args["feature_negation"],
        platform=args["platform"],
        output_balancing=args["output_balancing"],
    )

    print(f"Starting training for {args['epochs']} epochs")

    for e in range(args["epochs"]):
        tm.fit(X_train, number_of_examples=args["number_of_examples"])

        lmean, lmed, lmax, lmin = test(tm, X_test)
        # ".4f" = four decimals; the previous "4f" only set a minimum width.
        print(f"Epoch: {e + 1} Mean loss: {lmean:.4f}, Median loss: {lmed:.4f}, Max loss: {lmax:.4f}, Min loss: {lmin:.4f}")

        # Checkpoint after every epoch so an interrupted run keeps its progress.
        save_model(tm, f"latest_{e}.pkl")

    return tm


In [45]:
# Clause budget for the model built later in the notebook.
number_of_clauses = 5000

# Column count of the training matrix, and an index array 0 .. n-1 covering
# every one of those columns.
number_of_features = X_train.shape[1]
output_active = np.arange(0, number_of_features, dtype=np.uint32)

print(
    f"Number of features {number_of_features}",
    f"Number of clauses {number_of_clauses}",
    sep="\n",
)

Number of features 7600
Number of clauses 5000


In [46]:
# Settings read by train(): model constructor arguments plus the number of
# epochs and the number of examples passed to each fit() call.
args: dict = dict(
    number_of_examples=100,
    output_active=output_active,
    accumulation=1,
    num_clauses=number_of_clauses,
    T=int(number_of_clauses * 2),
    s=15.0,
    epochs=25,
    platform="CPU",
    output_balancing=0,
    max_included_literals=2000,
    feature_negation=True,
)

result = train(args)

Starting training for 25 epochs
Epoch: 1 Mean loss: 0.508802, Median loss: 0.508947, Max loss: 0.524605, Min loss: 0.491316
Epoch: 2 Mean loss: 0.509387, Median loss: 0.509474, Max loss: 0.524605, Min loss: 0.491316
Epoch: 3 Mean loss: 0.509387, Median loss: 0.509474, Max loss: 0.524605, Min loss: 0.491316


KeyboardInterrupt: 