# Regular Expression Dataset Generation

## Utilities
### Definitions

In [170]:
import re
from itertools import product
from itertools import islice
from string import ascii_letters
import csv
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import keras
from keras import layers
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

In [15]:
def all_strings(max_length, alphabet):
    """Returns a generator of all strings up to a given length from an alphabet"""
    for length in range(max_length + 1):
        for s in product(alphabet, repeat=length):
            yield ''.join(s)

In [16]:
def extract_valid_regexes(candidates):
    """Takes a generator of strings and returns a generator of compiled valid regexes"""
    for c in candidates:
        try:
            yield re.compile(c)
        except:
            pass

In [17]:
def all_regexes(max_length, alphabet):
    """Returns a generator of all valid regexes up to a length from an alphabet"""
    return extract_valid_regexes(all_strings(max_length, alphabet))

## Dataset Generation Utilities
### Definitions

In [19]:
def regex_apply_all(regex, max_length, alphabet):
    """Returns a generator that takes a compiled regex and yields
    tuples for every string of length from alphabet with whether
    or not it matches the regex"""
    for string in all_strings(max_length, alphabet):
        yield (string, bool(regex.fullmatch(string)))

In [20]:
def triple_count_bound(regex_max_length, string_max_length, alphabet, regex_chars):
    """Returns an upper bound on the size of a dataset generated with the same parameters"""
    count_regexes = sum(len(alphabet + regex_chars) ** length for length in range(regex_max_length + 1))
    count_strings = sum(len(alphabet) ** length for length in range(string_max_length + 1))
    return count_regexes * count_strings

In [22]:
def triple_generator(regex_max_length, string_max_length, alphabet, regex_chars):
    """Returns a generator that gives for every regex-string pair
    up to a length whether or not they match"""
    for regex in all_regexes(regex_max_length, alphabet + regex_chars):
        for string in all_strings(string_max_length, alphabet):
            yield regex.pattern, string, bool(regex.fullmatch(string))

In [162]:
def dataset_generator_concat(regex_max_length, string_max_length, alphabet, regex_chars):
    """Returns a generator that gives for every regex-string pair
    up to a length whether or not they match, with strings encoded
    as character index lists"""
    char_int = {c: i + 1 for i, c in enumerate(alphabet + regex_chars)}
    concat_int = len(alphabet + regex_chars) + 1
    
    for regex in all_regexes(regex_max_length, alphabet + regex_chars):
        regex_ints = [char_int[c] for c in regex.pattern]
        for string in all_strings(string_max_length, alphabet):
            string_ints = [char_int[c] for c in string]
            if len(regex.pattern) != 0 and len(string) != 0:
                yield (regex_ints + [concat_int] + string_ints,
                      np.int64(int(bool(regex.fullmatch(string)))))

### Examples

In [163]:
list(islice(dataset_generator_concat(2, 2, "a", "*()"), 5))

Exception ignored in: <generator object extract_valid_regexes at 0x1533810a0>
RuntimeError: generator ignored GeneratorExit


[([1, 5, 1], 1),
 ([1, 5, 1, 1], 0),
 ([1, 1, 5, 1], 0),
 ([1, 1, 5, 1, 1], 1),
 ([1, 2, 5, 1], 1)]

## Generate Dataset

### Generate, Balance, and Split

In [164]:
# Generate
alphabet = "ab"
regex_chars = "|*()"
regex_max_len = 4
string_max_len = 4
X, y = map(np.array, zip(*dataset_generator_concat(regex_max_len, string_max_len,
                                                   alphabet, regex_chars)))
print("Initial data size: {}".format(X.shape))

# Balance
X, y = shuffle(X, y, random_state=42)
pos = (y == 1)
neg = (y == 0)
class_size = min(pos.sum(), len(pos) - pos.sum())
X_pos, y_pos = X[pos][:class_size], y[pos][:class_size]
X_neg, y_neg = X[neg][:class_size], y[neg][:class_size]

X = np.concatenate((X_pos, X_neg))
y = np.concatenate((y_pos, y_neg))

X, y = shuffle(X, y, random_state=42)

print("Balanced data size: {}".format(X.shape))

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("Train/Test split data size: {}/{}".format(X_train.shape, X_test.shape))

Initial data size: (8220,)
Balanced data size: (1036,)
Train/Test split data size: (694,)/(342,)


### Prepare

In [167]:
# Padding #TODO is this necessary?
max_len = regex_max_len + string_max_len + 1
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

## LSTM

In [185]:
# create the model
embedding_input_dim = len(alphabet + regex_chars) + 2
embedding_output_dim = 3

model = Sequential()
model.add(Embedding(embedding_input_dim, embedding_output_dim, input_length=max_len))
model.add(LSTM(10))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_46 (Embedding)     (None, 9, 3)              24        
_________________________________________________________________
lstm_46 (LSTM)               (None, 10)                560       
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 11        
Total params: 595
Trainable params: 595
Non-trainable params: 0
_________________________________________________________________
None
Train on 694 samples, validate on 342 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Ep

<keras.callbacks.History at 0x149efb128>

In [184]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 73.10%
