In [1]:
# Switch the kernel's working directory to the parent directory.
# The path is relative, so re-running this cell moves up one more level each time.
# Written as an explicit line magic so it does not depend on automagic being enabled.
%cd ../

C:\Users\yulya\PycharmProjects\machine_learning_of_patterns


In [2]:
import time
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
from random import randint, random
from typing import Callable, Any, Tuple, List
from tqdm import tqdm

In [3]:
from regex.generator import RegexGenerator
from regex.parser import RegexParser

In [4]:
def generate_stateful_regex(
        length: int,
        star_num: int,
        star_nesting: int,
        alphabet_size: int = 26) -> str:
    """Return a generated regex string whose length is exactly `length`.

    A single RegexGenerator is built from the four arguments (forwarded
    unchanged) and asked for regexes repeatedly; the first one with exactly
    `length` characters is returned.

    When `length` is 0 the empty string is returned without asking the
    generator for anything. The loop has no retry limit, so it never ends if
    the generator cannot produce a string of the requested length.
    """
    generator = RegexGenerator(length, star_num, star_nesting, alphabet_size)
    candidate = ""
    while True:
        if len(candidate) == length:
            return candidate
        # Wrong size: discard and draw again.
        candidate = generator.generate_regex()



def generate_word(
        length: int,
        alphabet_size: int = 26) -> str:
    """Build a random word of `length` characters.

    Characters are drawn independently from the first `alphabet_size`
    lowercase ASCII letters followed by the first `alphabet_size` uppercase
    ASCII letters. A non-positive `length` yields the empty string.
    """
    symbols = string.ascii_lowercase[:alphabet_size] + string.ascii_uppercase[:alphabet_size]
    last_index = len(symbols) - 1
    # randint is inclusive on both ends, hence the explicit last index.
    return ''.join(symbols[randint(0, last_index)] for _ in range(length))
    


def measure_time(func: Callable, *params) -> Tuple:
    """Call `func(*params)` once and time it.

    Returns a pair `(result, elapsed)` where `result` is whatever `func`
    returned and `elapsed` is the duration of the call in seconds, measured
    with `time.perf_counter`.
    """
    started_at = time.perf_counter()
    outcome = func(*params)
    elapsed = time.perf_counter() - started_at
    return outcome, elapsed


def count_items_in_regex(regex: str, alphabet_size: int = 26) -> Tuple[int, int]:
    """Count the alphabet letters and the `*` characters in a regex string.

    The alphabet consists of the first `alphabet_size` lowercase ASCII letters
    plus the first `alphabet_size` uppercase ASCII letters. Every other
    character (parentheses, `|`, letters outside the alphabet, ...) is ignored.

    Returns:
        A pair `(letters, stars)`.
    """
    # A frozenset gives constant-time membership checks per character.
    alphabet = frozenset(string.ascii_lowercase[:alphabet_size] +
                         string.ascii_uppercase[:alphabet_size])
    letters = sum(1 for c in regex if c in alphabet)
    # '*' can never be part of the alphabet, so a plain count is sufficient.
    stars = regex.count('*')
    return letters, stars

In [5]:
# Regex lengths to benchmark: 5, 10, ..., 30.
regex_length = list(range(5, 31, 5))
# Alphabet size; same value as the default of the helpers' `alphabet_size` parameter.
alphabet_size = 26

In [7]:
# Column-oriented accumulator: one list per output column, one entry appended
# per (regex, matcher) pair measured below.
results = {
    "regex": [],
    "length": [],
    "word": [],
    "matching_type": [],
    "time": [],
    "match": []
}

for length in regex_length:
    # The star count and the star nesting passed to the generator are both length // 2.
    star_nesting = star_num = length // 2
    # Pass the notebook-level alphabet_size explicitly so that changing it in
    # the configuration cell affects the generator, the counter and the word.
    regex_str = generate_stateful_regex(length, star_num, star_nesting, alphabet_size)
    letters, stars = count_items_in_regex(regex_str, alphabet_size)
    # NOTE(review): the word length is letters ** stars, which grows
    # exponentially with the number of stars in the generated regex; confirm
    # this stays within memory for the larger regex lengths.
    word = generate_word(letters ** stars, alphabet_size)

    # NOTE(review): the meaning of the per-star value 100 is defined by
    # RegexParser and is not visible here; confirm against its documentation.
    parser = RegexParser(regex_str, [100] * star_num)
    regex = parser.parse()
    nfa = regex.to_nfa()
    dfa = nfa.to_dfa()

    # The same word is matched with each of the three representations.
    matchers = {
        "regex": regex,
        "nfa": nfa,
        "dfa": dfa
    }
    for matching_type, matcher in matchers.items():
        # Single timed call per matcher; elapsed is in seconds.
        match, elapsed = measure_time(matcher.match, word)
        results["regex"].append(regex_str)
        results["length"].append(len(regex_str))
        results["word"].append(word)
        results["matching_type"].append(matching_type)
        results["time"].append(elapsed)
        results["match"].append(match)



In [10]:
# One row per (regex, matcher) measurement; the columns are the keys of `results`.
df = pd.DataFrame(results)

In [11]:
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,regex,length,word,matching_type,time,match
0,P*|L*,5,Gncf,regex,2.6e-05,False
1,P*|L*,5,Gncf,nfa,2.9e-05,False
2,P*|L*,5,Gncf,dfa,6e-06,False
3,pY*(U*)O*i,10,baTnSDNnQuMEtmsemWuQwYzetfzrbNhwEMAdPzJTgVMBuE...,regex,1.8e-05,False
4,pY*(U*)O*i,10,baTnSDNnQuMEtmsemWuQwYzetfzrbNhwEMAdPzJTgVMBuE...,nfa,2.7e-05,False
5,pY*(U*)O*i,10,baTnSDNnQuMEtmsemWuQwYzetfzrbNhwEMAdPzJTgVMBuE...,dfa,1.6e-05,False
6,(z*|V|(b*))*(a),15,eIhoEkyYMNNMftiIbIGZzjHQJEeQbYOSDedUuuxloaTwBn...,regex,6.9e-05,False
7,(z*|V|(b*))*(a),15,eIhoEkyYMNNMftiIbIGZzjHQJEeQbYOSDedUuuxloaTwBn...,nfa,4.1e-05,False
8,(z*|V|(b*))*(a),15,eIhoEkyYMNNMftiIbIGZzjHQJEeQbYOSDedUuuxloaTwBn...,dfa,7e-06,False
9,(E|A)i|(h*g*QlN(R))*,20,ccOQkcHmtjjeZlllnghaxzfiszEuAmPQNzCRCzNBmfngvd...,regex,4.1e-05,False


In [12]:
# Persist the measurements as CSV, without the row index column.
# The path is relative to the current working directory.
df.to_csv(path_or_buf='./experiments/matching_time.csv', index=False)