# Regular Expression Dataset Generation

## Utilities
### Definitions

In [1]:
import re
from itertools import product
from string import ascii_letters
import csv
import numpy as np

In [2]:
def all_strings(max_length, alphabet):
    """Lazily enumerate every string over ``alphabet`` of length 0..max_length.

    Strings are produced shortest first (starting with the empty string);
    within one length they follow the order of ``itertools.product``.
    A negative ``max_length`` produces nothing.
    """
    for size in range(max_length + 1):
        # product() yields tuples of symbols; glue each tuple into a string.
        yield from map(''.join, product(alphabet, repeat=size))

In [3]:
def extract_valid_regexes(candidates):
    """Yield a compiled regex for every candidate string that is a valid pattern.

    Candidates that ``re.compile`` rejects are skipped silently. Only the
    compilation is guarded: exceptions raised by the consumer, and the
    ``GeneratorExit`` sent when the generator is closed early, are no longer
    swallowed.

    Args:
        candidates: iterable of pattern strings to try.

    Yields:
        Compiled pattern objects, in the order of their source candidates.
    """
    for candidate in candidates:
        try:
            compiled = re.compile(candidate)
        except (re.error, OverflowError):
            # re.error: syntactically invalid pattern.
            # OverflowError: repetition count too large for the regex engine.
            continue
        # Yield outside the try block so that closing or throwing into this
        # generator is never mistaken for an invalid pattern.
        yield compiled

In [4]:
def all_regexes(max_length, alphabet):
    """Lazily produce every compilable regex of length 0..max_length over ``alphabet``.

    Every string from ``all_strings`` is treated as a candidate pattern and
    filtered through ``extract_valid_regexes``.
    """
    candidate_patterns = all_strings(max_length, alphabet)
    compiled_patterns = extract_valid_regexes(candidate_patterns)
    return compiled_patterns

### Examples

In [5]:
# Every string of length <= 2 over the alphabet "ab", shortest first.
print(list(all_strings(2, "ab")))
# The first six valid regexes of length <= 3 over the symbols "ab|*".
print(list(all_regexes(3, "ab|*"))[:6])

['', 'a', 'b', 'aa', 'ab', 'ba', 'bb']
[re.compile(''), re.compile('a'), re.compile('b'), re.compile('|'), re.compile('aa'), re.compile('ab')]


## Dataset Generation
### Definitions

In [6]:
def regex_apply_all(regex, max_length, alphabet):
    """Pair every string of length 0..max_length over ``alphabet`` with its verdict.

    Yields ``(string, matched)`` tuples, where ``matched`` is True exactly
    when the compiled ``regex`` matches the whole string (``fullmatch``).
    """
    yield from (
        (candidate, bool(regex.fullmatch(candidate)))
        for candidate in all_strings(max_length, alphabet)
    )

In [7]:
def triple_count_bound(regex_max_length, string_max_length, alphabet, regex_chars):
    """Upper bound on the number of triples a dataset with these parameters holds.

    The bound counts every candidate pattern (valid or not) over
    ``alphabet + regex_chars`` and multiplies it by the number of strings
    over ``alphabet``.
    """
    def _count_up_to(longest, symbol_count):
        # Number of strings of length 0..longest over symbol_count symbols.
        return sum(symbol_count ** size for size in range(longest + 1))

    pattern_total = _count_up_to(regex_max_length, len(alphabet + regex_chars))
    string_total = _count_up_to(string_max_length, len(alphabet))
    return pattern_total * string_total

In [8]:
def triple_generator(regex_max_length, string_max_length, alphabet, regex_chars):
    """Lazily yield ``(pattern, string, matched)`` for every regex/string pair.

    Patterns are all valid regexes of length 0..regex_max_length over
    ``alphabet + regex_chars``; strings are all strings of length
    0..string_max_length over ``alphabet``. ``matched`` tells whether the
    pattern matches the whole string.
    """
    pattern_symbols = alphabet + regex_chars
    for compiled in all_regexes(regex_max_length, pattern_symbols):
        # The string generator is single-use, so rebuild it for each regex.
        for candidate in all_strings(string_max_length, alphabet):
            matched = bool(compiled.fullmatch(candidate))
            yield compiled.pattern, candidate, matched

### Examples

In [9]:
# Materialize every (pattern, string, matched) triple for patterns and strings of length <= 2.
list(triple_generator(2, 2, "ab", "|*()"))

[('', '', True),
 ('', 'a', False),
 ('', 'b', False),
 ('', 'aa', False),
 ('', 'ab', False),
 ('', 'ba', False),
 ('', 'bb', False),
 ('a', '', False),
 ('a', 'a', True),
 ('a', 'b', False),
 ('a', 'aa', False),
 ('a', 'ab', False),
 ('a', 'ba', False),
 ('a', 'bb', False),
 ('b', '', False),
 ('b', 'a', False),
 ('b', 'b', True),
 ('b', 'aa', False),
 ('b', 'ab', False),
 ('b', 'ba', False),
 ('b', 'bb', False),
 ('|', '', True),
 ('|', 'a', False),
 ('|', 'b', False),
 ('|', 'aa', False),
 ('|', 'ab', False),
 ('|', 'ba', False),
 ('|', 'bb', False),
 ('aa', '', False),
 ('aa', 'a', False),
 ('aa', 'b', False),
 ('aa', 'aa', True),
 ('aa', 'ab', False),
 ('aa', 'ba', False),
 ('aa', 'bb', False),
 ('ab', '', False),
 ('ab', 'a', False),
 ('ab', 'b', False),
 ('ab', 'aa', False),
 ('ab', 'ab', True),
 ('ab', 'ba', False),
 ('ab', 'bb', False),
 ('a|', '', True),
 ('a|', 'a', True),
 ('a|', 'b', False),
 ('a|', 'aa', False),
 ('a|', 'ab', False),
 ('a|', 'ba', False),
 ('a|', 'bb', F

In [10]:
# Upper bound on the dataset size for patterns of length <= 5 and strings of length <= 15.
triple_count_bound(5, 15, "ab", "|*")

89455275

In [11]:
# Exact number of triples for patterns of length <= 2 and strings of length <= 10.
# Count while streaming so the millions of tuples are never held in memory at once.
sum(1 for _ in triple_generator(2, 10, "abc", "|*()"))

2214325

## Writing Dataset

### Padded Dataset

In [13]:
def integer_encode(X, char_to_int, max_len, pad_char=" "):
    """Encode each string as a fixed-length list of integer codes.

    Every string is right-padded with ``pad_char`` up to ``max_len`` and then
    mapped character by character through ``char_to_int``. Strings longer
    than ``max_len`` are not truncated, so their rows are longer.

    Args:
        X: iterable of strings to encode.
        char_to_int: mapping from single characters to integer codes; it must
            contain ``pad_char`` whenever padding is actually needed.
        max_len: length every shorter string is padded to.
        pad_char: single padding character (defaults to a space, matching
            the previous hard-coded behavior).

    Returns:
        A list with one list of integer codes per input string.

    Raises:
        KeyError: if a character (or the pad character) is not in ``char_to_int``.
    """
    return [
        [char_to_int[char] for char in pattern.ljust(max_len, pad_char)]
        for pattern in X
    ]

In [14]:
def one_hot_encode(X, alphabet_size):
    """One-hot encode sequences of integer codes.

    Code 0 (padding) becomes the all-zero vector; a non-zero code ``k`` sets
    position ``k - 1`` of a vector of length ``alphabet_size``.
    Returns a nested list indexed as [sequence][position][symbol].
    """
    def _one_hot(code):
        row = [0] * alphabet_size
        if code != 0:
            row[code - 1] = 1
        return row

    return [[_one_hot(code) for code in sequence] for sequence in X]

In [15]:
def generate_data(max_regex_len, max_string_len, alphabet, regex_chars):
    """Build the padded, one-hot encoded dataset of regex/string match labels.

    Returns a tuple ``(X_r, X_s, y)`` of numpy arrays with one row per triple
    from ``triple_generator``:
      * ``X_r`` - one-hot patterns, padded to ``max_regex_len`` positions,
        each position a vector of length ``len(alphabet + regex_chars)``;
      * ``X_s`` - one-hot strings, padded to ``max_string_len`` positions,
        each position a vector of length ``len(alphabet)``;
      * ``y``   - column vector of 0/1 labels (1 when the pattern matches).
    """
    # Embedding parameters: the pad character comes first so it gets code 0,
    # which one_hot_encode turns into the all-zero vector.
    pad_char = " "
    vocabulary = pad_char + alphabet + regex_chars
    char_to_int = {symbol: code for code, symbol in enumerate(vocabulary)}

    # Generate the dataset and split the triples into three parallel lists.
    triples = triple_generator(max_regex_len, max_string_len, alphabet, regex_chars)
    regexes, strings, labels = (list(column) for column in zip(*triples))

    regex_codes = integer_encode(regexes, char_to_int, max_regex_len)
    string_codes = integer_encode(strings, char_to_int, max_string_len)

    X_r = np.array(one_hot_encode(regex_codes, len(alphabet + regex_chars)))
    X_s = np.array(one_hot_encode(string_codes, len(alphabet)))

    # ndmin=2 gives a single row; transpose it into one label per row.
    y = np.array([int(flag) for flag in labels], ndmin=2).T

    return X_r, X_s, y

In [17]:
# define problem
max_regex_len = 5
max_string_len = 10
alphabet = "ab"
regex_chars = "|*()"

X_r, X_s, y = generate_data(max_regex_len, max_string_len, alphabet, regex_chars)

In [18]:
# Write the three arrays to disk, using the configuration name as the file prefix.
config = "test"
np.save(f"{config}_X_r", X_r)
np.save(f"{config}_X_s", X_s)
np.save(f"{config}_y", y)

6638
(2468682, 1)
