In [33]:
import numpy as np
import random
import re
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [34]:
# Seed every RNG the notebook uses (stdlib `random`, NumPy, TensorFlow) from
# one constant, so the seed is changed in a single place and the three
# generators can never drift apart.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [35]:
# Symbols that the random strings are drawn from.
alphabet = list('abcd')

In [36]:
def random_string(length=15):
    """Return `length` symbols drawn with replacement from `alphabet`, joined into a str."""
    drawn = random.choices(alphabet, k=length)
    return ''.join(drawn)

In [37]:
# Target motif: a 5-character window of the form
#     a, (c|d), b, (c|d), a
# i.e. one of "acbca", "acbda", "adbca", "adbda".
# `pattern.search` reports a hit when the motif occurs anywhere in a string.
pattern = re.compile(r'a[cd]b[cd]a')

In [38]:
# Maps each alphabet symbol to its column in the one-hot encoding.
char_to_index = {'a': 0, 'b': 1, 'c': 2, 'd': 3}


def one_hot_encode_string(s):
    """One-hot encode a string over the `char_to_index` vocabulary.

    Args:
        s: String whose characters are all keys of `char_to_index`.

    Returns:
        Float array of shape (len(s), len(char_to_index)). Row i holds 1.0 in
        the column assigned to s[i] and 0.0 elsewhere. An empty string yields
        an array of shape (0, len(char_to_index)).

    Raises:
        KeyError: If `s` contains a character that is not in `char_to_index`.
    """
    # An explicit integer dtype keeps the index array valid when `s` is empty.
    indices = np.array([char_to_index[c] for c in s], dtype=np.intp)
    # Size the identity matrix from the mapping instead of a literal 4 so the
    # encoder follows the vocabulary if it is ever extended.
    return np.eye(len(char_to_index))[indices]

In [39]:
# Functions to generate positive and negative examples.
def generate_positive():
    """
    Build a random length-15 string and overwrite one 5-character window
    with a freshly drawn match of the regex pattern, guaranteeing a hit.
    """
    chars = list(random_string(15))
    start = random.randint(0, 15 - 5)  # inclusive bounds: window always fits
    # Draw the two free positions of the motif in left-to-right order.
    second = random.choice(['c', 'd'])
    fourth = random.choice(['c', 'd'])
    chars[start:start + 5] = ['a', second, 'b', fourth, 'a']
    return ''.join(chars)

def generate_negative():
    """
    Rejection-sample random length-15 strings until one contains no match
    for the regex pattern, and return that string.
    """
    candidate = random_string(15)
    while pattern.search(candidate) is not None:
        # The motif appeared by chance; draw a fresh string.
        candidate = random_string(15)
    return candidate

In [40]:
# Dataset size, split evenly between the classes; the negatives take the
# remainder so the two counts always sum to n_examples.
n_examples = 10_000
n_positive, n_negative = n_examples // 2, n_examples - n_examples // 2

# Accumulators for the raw strings and their 0/1 labels.
data, labels = [], []

In [41]:
# Build the dataset from scratch inside this cell instead of appending to
# lists created in an earlier cell. Re-running the cell therefore neither
# duplicates examples nor fails on `labels`, which is rebound to a NumPy
# array at the end. Positives are drawn before negatives, so the sequence of
# RNG calls is the same as with the two append loops.
positives = [generate_positive() for _ in range(n_positive)]  # label 1
negatives = [generate_negative() for _ in range(n_negative)]  # label 0

# Shuffle strings together with their labels so each pairing is preserved
# while positives and negatives are mixed.
combined = list(zip(positives + negatives, [1] * n_positive + [0] * n_negative))
random.shuffle(combined)

data = [s for s, _ in combined]
labels = np.array([label for _, label in combined])

In [42]:
# Turn every string into its one-hot matrix and collect them in one array;
# with 15-character strings over 4 symbols each example is (15, 4).
X = np.array(list(map(one_hot_encode_string, data)))
y = labels

print("Dataset shapes:")
print("X:", X.shape)  # expected: (10000, 15, 4)
print("y:", y.shape)

Dataset shapes:
X: (10000, 15, 4)
y: (10000,)


In [43]:
# Sequential 80/20 split: the first 80% of rows train, the remainder tests.
split_index = int(0.8 * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (8000, 15, 4)
Testing set shape: (2000, 15, 4)


In [44]:
# Baseline: one convolution filter of width 5 (the motif length) slides over
# the 15-step sequence, giving 11 window responses; these are flattened and
# combined by a single sigmoid unit into a match probability.
model = keras.Sequential([
    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first layer is deprecated in Keras 3 and emits a UserWarning.
    keras.Input(shape=(15, 4)),
    layers.Conv1D(filters=1, kernel_size=5, activation='relu'),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [45]:
# Train the baseline for 10 epochs, with 10% of the training rows held out
# for validation. The returned History is kept so the per-epoch metrics stay
# available for inspection and the cell does not end on a bare object repr.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step - accuracy: 0.5041 - loss: 0.7367 - val_accuracy: 0.5725 - val_loss: 0.6897
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 456us/step - accuracy: 0.5682 - loss: 0.6911 - val_accuracy: 0.6025 - val_loss: 0.6752
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step - accuracy: 0.6016 - loss: 0.6711 - val_accuracy: 0.6350 - val_loss: 0.6500
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454us/step - accuracy: 0.6587 - loss: 0.6390 - val_accuracy: 0.7125 - val_loss: 0.6185
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443us/step - accuracy: 0.7170 - loss: 0.6034 - val_accuracy: 0.7625 - val_loss: 0.5863
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step - accuracy: 0.7619 - loss: 0.5717 - val_accuracy: 0.7638 - val_loss: 0.5593
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x16c4bfbe0>

In [46]:
# Score the baseline on the held-out test rows. The model was compiled with
# a single metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print("\nModel Test Accuracy:", accuracy)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494us/step - accuracy: 0.8169 - loss: 0.4973

Model Test Accuracy: 0.8190000057220459


In [47]:
# Deeper variant: two convolution stages with pooling in between, followed by
# a hidden dense layer. With the default stride and padding the sequence
# length goes 15 -> 11 (conv, k=5) -> 5 (pool, 2) -> 3 (conv, k=3).
model_complex = keras.Sequential([
    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first layer is deprecated in Keras 3 and emits a UserWarning.
    keras.Input(shape=(15, 4)),
    layers.Conv1D(filters=8, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(filters=16, kernel_size=3, activation='relu'),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

In [48]:
# Same optimiser, loss and metric as the baseline.
model_complex.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_complex.summary()

# Identical training schedule: 10 epochs, batches of 32, 10% validation.
model_complex.fit(x=X_train, y=y_train, epochs=10, batch_size=32, validation_split=0.1)

# Note: this rebinds `loss` and `accuracy`, replacing the baseline's values.
loss, accuracy = model_complex.evaluate(x=X_test, y=y_test)
print("\nComplex Model Test Accuracy:", accuracy)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 868us/step - accuracy: 0.6053 - loss: 0.6658 - val_accuracy: 0.7638 - val_loss: 0.4998
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 591us/step - accuracy: 0.8018 - loss: 0.4403 - val_accuracy: 0.8913 - val_loss: 0.2946
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 588us/step - accuracy: 0.8982 - loss: 0.2656 - val_accuracy: 0.9287 - val_loss: 0.1946
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 588us/step - accuracy: 0.9361 - loss: 0.1706 - val_accuracy: 0.9425 - val_loss: 0.1339
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 575us/step - accuracy: 0.9586 - loss: 0.1071 - val_accuracy: 0.9638 - val_loss: 0.0890
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 585us/step - accuracy: 0.9753 - loss: 0.0705 - val_accuracy: 0.9688 - val_loss: 0.0638
Epoch 7/10
[1m2