# Regular Expression Dataset Generation

## Utilities
### Definitions

In [84]:
import re
from itertools import product
from string import ascii_letters
import csv
import numpy as np

In [4]:
def all_strings(max_length, alphabet):
    """Lazily enumerate every string over ``alphabet`` of length 0..max_length.

    Strings are produced shortest first; within one length they follow the
    order in which ``itertools.product`` walks the alphabet. The empty string
    is always the first item when ``max_length`` is at least 0.
    """
    for size in range(max_length + 1):
        # product() gives tuples of symbols; join each tuple into one string.
        yield from map(''.join, product(alphabet, repeat=size))

In [5]:
def extract_valid_regexes(candidates):
    """Yield a compiled pattern for each candidate that is a valid regex.

    Candidates that ``re.compile`` rejects are skipped silently, so the
    output keeps the input order with the invalid entries removed.

    Args:
        candidates: Iterable of pattern strings.

    Yields:
        The compiled pattern object for every candidate that compiles.

    Only compile-time rejections are suppressed. Anything else (for example
    a non-string candidate, or an interrupt) propagates to the caller.
    """
    for candidate in candidates:
        try:
            compiled = re.compile(candidate)
        except (re.error, ValueError, OverflowError, RecursionError):
            # Not a usable pattern: syntax error, conflicting inline flags,
            # an oversized repeat count, or nesting too deep to parse.
            continue
        # Yield outside the ``try`` so that GeneratorExit (raised at this
        # point by ``close()`` or garbage collection) and exceptions thrown in
        # by the consumer are never swallowed. Catching them around the yield
        # makes ``close()`` fail with "generator ignored GeneratorExit".
        yield compiled

In [6]:
def all_regexes(max_length, alphabet):
    """Lazily enumerate the compilable regexes over ``alphabet``.

    Every string of length 0 through ``max_length`` is tried as a pattern;
    only those that compile are produced, as compiled pattern objects.
    """
    candidate_patterns = all_strings(max_length, alphabet)
    return extract_valid_regexes(candidate_patterns)

### Examples

In [7]:
# Show every string over {a, b} of length 0..2, then the first six strings
# over {a, b, |, *} of length 0..3 that compile as regexes.
print(list(all_strings(2, "ab")))
print(list(all_regexes(3, "ab|*"))[:6])

['', 'a', 'b', 'aa', 'ab', 'ba', 'bb']
[re.compile(''), re.compile('a'), re.compile('b'), re.compile('|'), re.compile('aa'), re.compile('ab')]


## Dataset Generation
### Definitions

In [8]:
def regex_apply_all(regex, max_length, alphabet):
    """Pair every string over ``alphabet`` with its match result.

    Yields ``(string, matched)`` tuples for each string of length 0 through
    ``max_length``, where ``matched`` is True when the compiled ``regex``
    matches the whole string (``fullmatch``) and False otherwise.
    """
    yield from (
        (text, bool(regex.fullmatch(text)))
        for text in all_strings(max_length, alphabet)
    )

In [9]:
def dataset_size_bound(regex_max_length, string_max_length, alphabet, regex_chars):
    """Return an upper bound on the number of rows in the matching dataset.

    The bound multiplies the number of candidate regex strings (every string
    over ``alphabet + regex_chars`` up to ``regex_max_length``, whether or not
    it compiles) by the number of input strings over ``alphabet`` up to
    ``string_max_length``.
    """
    # Candidate patterns: one term per pattern length.
    regex_total = 0
    for regex_len in range(regex_max_length + 1):
        regex_total += len(alphabet + regex_chars) ** regex_len

    # Input strings: one term per string length.
    string_total = 0
    for string_len in range(string_max_length + 1):
        string_total += len(alphabet) ** string_len

    return regex_total * string_total

In [40]:
def dataset_generator(regex_max_length, string_max_length, alphabet, regex_chars):
    """Lazily produce ``(pattern, string, matched)`` rows.

    Patterns are the valid regexes over ``alphabet + regex_chars`` of length
    up to ``regex_max_length``. Each one is paired with every string over
    ``alphabet`` of length up to ``string_max_length``. ``matched`` is True
    when the pattern matches the whole string.
    """
    pattern_symbols = alphabet + regex_chars
    for compiled in all_regexes(regex_max_length, pattern_symbols):
        source = compiled.pattern
        # The string enumeration is restarted for every pattern.
        for text in all_strings(string_max_length, alphabet):
            yield source, text, bool(compiled.fullmatch(text))

### Examples

In [11]:
# Label each string over {a, b} of length 0..2 by whether 'a|aa' matches it in full.
list(regex_apply_all(re.compile(r'a|aa'), 2, "ab"))

[('', False),
 ('a', True),
 ('b', False),
 ('aa', True),
 ('ab', False),
 ('ba', False),
 ('bb', False)]

In [69]:
# Materialize the full dataset: valid regexes of length <= 2 over "ab" plus the
# operators "|*()", each paired with every string over "ab" of length <= 2.
list(dataset_generator(2, 2, "ab", "|*()"))

[('', '', True),
 ('', 'a', False),
 ('', 'b', False),
 ('', 'aa', False),
 ('', 'ab', False),
 ('', 'ba', False),
 ('', 'bb', False),
 ('a', '', False),
 ('a', 'a', True),
 ('a', 'b', False),
 ('a', 'aa', False),
 ('a', 'ab', False),
 ('a', 'ba', False),
 ('a', 'bb', False),
 ('b', '', False),
 ('b', 'a', False),
 ('b', 'b', True),
 ('b', 'aa', False),
 ('b', 'ab', False),
 ('b', 'ba', False),
 ('b', 'bb', False),
 ('|', '', True),
 ('|', 'a', False),
 ('|', 'b', False),
 ('|', 'aa', False),
 ('|', 'ab', False),
 ('|', 'ba', False),
 ('|', 'bb', False),
 ('aa', '', False),
 ('aa', 'a', False),
 ('aa', 'b', False),
 ('aa', 'aa', True),
 ('aa', 'ab', False),
 ('aa', 'ba', False),
 ('aa', 'bb', False),
 ('ab', '', False),
 ('ab', 'a', False),
 ('ab', 'b', False),
 ('ab', 'aa', False),
 ('ab', 'ab', True),
 ('ab', 'ba', False),
 ('ab', 'bb', False),
 ('a|', '', True),
 ('a|', 'a', True),
 ('a|', 'b', False),
 ('a|', 'aa', False),
 ('a|', 'ab', False),
 ('a|', 'ba', False),
 ('a|', 'bb', F

In [72]:
# Upper bound on the row count for a larger configuration; it counts every
# candidate pattern string, including those that are not valid regexes.
dataset_size_bound(5, 10, "abcde", "|*()")

810913069330

In [71]:
# Exact row count for this configuration. Count while streaming rather than
# building a list of millions of tuples only to take its length.
sum(1 for _ in dataset_generator(2, 10, "abc", "|*()"))

2214325

## Writing Dataset

### Padded Dataset

In [77]:
# Parameters of the dataset that is padded and saved below.
max_regex_len = 2  # longest pattern enumerated; also the padded pattern width
max_string_len = 4  # longest input string enumerated; also the padded string width
alphabet = "ab"  # symbols the input strings are built from
regex_chars = "|*"  # operator symbols allowed in patterns in addition to the alphabet

In [78]:
def embed(string, embedding, length, pad_char):
    """Encode ``string`` as a fixed-width list of embedding values.

    The string is right-padded with ``pad_char`` up to ``length`` and cut to
    ``length`` if it is longer; each resulting character is then looked up in
    ``embedding``.
    """
    fixed_width = string.ljust(length, pad_char)
    fixed_width = fixed_width[:length]
    return [embedding[symbol] for symbol in fixed_width]

In [82]:
pad_char = "_"
# Map each symbol to its index; the pad symbol comes first, so it gets index 0.
embedding = {
    symbol: index
    for index, symbol in enumerate(pad_char + alphabet + regex_chars)
}
# Transpose the (pattern, string, label) rows into three parallel tuples.
regexes, strings, labels = zip(
    *dataset_generator(max_regex_len, max_string_len, alphabet, regex_chars)
)

# Encode patterns and strings as fixed-width integer rows; labels become 0/1.
X_r = np.array([
    embed(pattern, embedding, max_regex_len, pad_char) for pattern in regexes
])
X_s = np.array([
    embed(text, embedding, max_string_len, pad_char) for text in strings
])
y = np.array(list(map(int, labels)))

In [89]:
# Persist the three arrays under a shared filename prefix.
config = "test"
np.save(f"{config}_X_r", X_r)
np.save(f"{config}_X_s", X_s)
np.save(f"{config}_y", y)