# Regular Expression Dataset Generation

## Utilities
### Definitions

In [51]:
import re
from itertools import product
from string import ascii_letters
import csv
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
def all_strings(max_length, alphabet):
    """Lazily enumerate every string over ``alphabet`` of length 0..``max_length``.

    Strings are produced shortest first, so the empty string comes first
    whenever ``max_length >= 0``. Within one length they follow the order in
    which ``itertools.product`` emits its tuples. A negative ``max_length``
    produces nothing.
    """
    for size in range(max_length + 1):
        # product() yields tuples of symbols; glue each tuple into a string.
        yield from map(''.join, product(alphabet, repeat=size))

In [5]:
def extract_valid_regexes(candidates):
    """Yield a compiled pattern for every candidate string that is a valid regex.

    Args:
        candidates: iterable of pattern strings to try, consumed lazily.

    Yields:
        The object returned by ``re.compile`` for each candidate that
        compiles, in the order the candidates were given. Candidates that
        cannot be compiled are skipped silently.
    """
    for candidate in candidates:
        try:
            compiled = re.compile(candidate)
        except (re.error, OverflowError, RecursionError):
            # re.error: malformed pattern; OverflowError: repetition count too
            # large; RecursionError: pattern nested too deeply to parse.
            continue
        # The yield sits outside the try block on purpose: a handler wrapped
        # around it would also intercept GeneratorExit (and anything else
        # thrown in by the consumer), which breaks generator.close().
        yield compiled

In [6]:
def all_regexes(max_length, alphabet):
    """Lazily enumerate the compiled form of every valid regex over ``alphabet``.

    Every string of length 0..``max_length`` built from ``alphabet`` is tried
    as a pattern; only the ones that compile are produced.
    """
    candidate_patterns = all_strings(max_length, alphabet)
    return extract_valid_regexes(candidate_patterns)

### Examples

In [7]:
# Every string over the symbols a and b with at most two characters.
print(list(all_strings(2, "ab")))
# First six valid regexes of length <= 3 built from a, b, | and *.
print(list(all_regexes(3, "ab|*"))[:6])

['', 'a', 'b', 'aa', 'ab', 'ba', 'bb']
[re.compile(''), re.compile('a'), re.compile('b'), re.compile('|'), re.compile('aa'), re.compile('ab')]


## Dataset Generation
### Definitions

In [8]:
def regex_apply_all(regex, max_length, alphabet):
    """Pair each string over ``alphabet`` with whether ``regex`` matches it fully.

    ``regex`` is a compiled pattern. For every string of length
    0..``max_length`` built from ``alphabet`` the generator yields a
    ``(string, matched)`` tuple, where ``matched`` is True only when the
    whole string matches the pattern.
    """
    for candidate in all_strings(max_length, alphabet):
        is_match = bool(regex.fullmatch(candidate))
        yield candidate, is_match

In [9]:
def dataset_size_bound(regex_max_length, string_max_length, alphabet, regex_chars):
    """Return an upper bound on the number of rows a dataset with these parameters has.

    The bound is (number of candidate regex strings) * (number of test
    strings). Every candidate regex string is counted, including those that
    would not compile, so the value can exceed the real dataset size.
    """
    regex_symbol_count = len(alphabet + regex_chars)
    string_symbol_count = len(alphabet)

    # k ** n strings of length n exist over k symbols; sum over lengths 0..max.
    candidate_regexes = sum(
        regex_symbol_count ** n for n in range(regex_max_length + 1)
    )
    candidate_strings = sum(
        string_symbol_count ** n for n in range(string_max_length + 1)
    )
    return candidate_regexes * candidate_strings

In [40]:
def dataset_generator(regex_max_length, string_max_length, alphabet, regex_chars):
    """Lazily produce one labelled row per (valid regex, string) pair.

    Regexes are the valid patterns of length 0..``regex_max_length`` over
    ``alphabet + regex_chars``; strings are all strings of length
    0..``string_max_length`` over ``alphabet``. Each row is a
    ``(pattern_source, string, matched)`` tuple, where ``matched`` is True
    only when the pattern matches the whole string.
    """
    pattern_symbols = alphabet + regex_chars
    for compiled in all_regexes(regex_max_length, pattern_symbols):
        source = compiled.pattern
        for candidate in all_strings(string_max_length, alphabet):
            yield source, candidate, bool(compiled.fullmatch(candidate))

### Examples

In [11]:
# Label every string over a, b of length <= 2 against the pattern a|aa.
list(regex_apply_all(re.compile(r'a|aa'), 2, "ab"))

[('', False),
 ('a', True),
 ('b', False),
 ('aa', True),
 ('ab', False),
 ('ba', False),
 ('bb', False)]

In [12]:
# Full dataset: regexes of length <= 3 over a, b and |, each tested on all
# strings of length <= 2 over a, b.
list(dataset_generator(3, 2, "ab", "|"))

[('', '', True),
 ('', 'a', False),
 ('', 'b', False),
 ('', 'aa', False),
 ('', 'ab', False),
 ('', 'ba', False),
 ('', 'bb', False),
 ('a', '', False),
 ('a', 'a', True),
 ('a', 'b', False),
 ('a', 'aa', False),
 ('a', 'ab', False),
 ('a', 'ba', False),
 ('a', 'bb', False),
 ('b', '', False),
 ('b', 'a', False),
 ('b', 'b', True),
 ('b', 'aa', False),
 ('b', 'ab', False),
 ('b', 'ba', False),
 ('b', 'bb', False),
 ('|', '', True),
 ('|', 'a', False),
 ('|', 'b', False),
 ('|', 'aa', False),
 ('|', 'ab', False),
 ('|', 'ba', False),
 ('|', 'bb', False),
 ('aa', '', False),
 ('aa', 'a', False),
 ('aa', 'b', False),
 ('aa', 'aa', True),
 ('aa', 'ab', False),
 ('aa', 'ba', False),
 ('aa', 'bb', False),
 ('ab', '', False),
 ('ab', 'a', False),
 ('ab', 'b', False),
 ('ab', 'aa', False),
 ('ab', 'ab', True),
 ('ab', 'ba', False),
 ('ab', 'bb', False),
 ('a|', '', True),
 ('a|', 'a', True),
 ('a|', 'b', False),
 ('a|', 'aa', False),
 ('a|', 'ab', False),
 ('a|', 'ba', False),
 ('a|', 'bb', False),
 ...

In [13]:
# Upper bound for regexes of length <= 3 and strings of length <= 10.
dataset_size_bound(3, 10, "abc", "|*()")

35429200

In [14]:
# Actual number of rows for regexes of length <= 2 and strings of length <= 10.
# Counted lazily so the rows are never all held in memory at once.
sum(1 for _ in dataset_generator(2, 10, "abc", "|*()"))

2214325

## Writing Dataset

In [49]:
# Longest regex (in characters) to include in the dataset.
max_regex_len = 2
# Longest test string (in characters) to include in the dataset.
max_string_len = 2
# Symbols that test strings are built from; they also appear literally in regexes.
alphabet = "abc"
# Extra symbols that are only available when building regexes.
regex_chars = "|*()"

### Padded Dataset

In [52]:
# Symbol used to fill encoded sequences up to their fixed width.
pad_char = "_"
# Symbol -> integer index lookup. pad_char comes first, so it gets index 0,
# followed by the alphabet symbols and then the regex-only symbols.
embedding = {
    symbol: index
    for index, symbol in enumerate(pad_char + alphabet + regex_chars)
}

In [59]:
def embed(string, embedding, length, pad_char):
    """Encode ``string`` as a fixed-width list of symbol indices.

    The string is cut to at most ``length`` characters, or right-padded with
    ``pad_char`` up to ``length``, and each character is then looked up in
    the ``embedding`` mapping. A character missing from ``embedding`` raises
    ``KeyError``.
    """
    fixed_width = string[:length].ljust(length, pad_char)
    return [embedding[symbol] for symbol in fixed_width]

In [60]:
# Split the dataset rows into three parallel tuples: regex sources, test
# strings and match flags. The parameters come from the config cell so the
# dataset always agrees with the widths used for encoding below.
regexes, strings, labels = zip(
    *dataset_generator(max_regex_len, max_string_len, alphabet, regex_chars)
)

# Encode both inputs as fixed-width lists of symbol indices.
X_r = [embed(regex, embedding, max_regex_len, pad_char) for regex in regexes]
X_s = [embed(string, embedding, max_string_len, pad_char) for string in strings]
# Targets as 0/1 integers. This must read from `labels`: `y` does not exist
# until this line assigns it.
y = [int(label) for label in labels]

In [61]:
# Show the encoded regexes, one fixed-width list of symbol indices per row.
X_r

[[0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [2, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [3, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [4, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 2],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 4],
 [1, 4],
 [1, 4],
 [1, 4],
 [1, 4],
 [1, 4],
 [1, 4],
 

### Deprecated: Writing CSV

In [67]:
# \1 is a backreference to group 1, so the same run of a's that (a+) captured
# has to appear three times, with a single b between the runs.
ex = re.compile(r'(a+)b\1b\1')

In [68]:
# Three identical runs of 'aaa' separated by 'b', so the whole string matches.
print(ex.fullmatch('aaabaaabaaa'))

<_sre.SRE_Match object; span=(0, 11), match='aaabaaabaaa'>
