In [1]:
cd ../

C:\Users\yulya\PycharmProjects\machine_learning_of_patterns


In [2]:
import time
import numpy as np
from typing import Any, Tuple

In [3]:
from regex.parser import RegexParser
from regex.regex import *

In [4]:
# Load the raw regex corpus, one pattern per line.
with open("datasets/dataset.csv", "r") as f:
    # Iterate the handle instead of slicing off the last split element: the
    # final regex is kept even when the file does not end with a newline,
    # while a trailing newline still does not produce an empty entry.
    REGEXES = [line.rstrip("\n") for line in f]

## Parentheses expansion

In [5]:
# Parentheses-expansion fixtures: each entry is [input regex, expected str() of the parsed result].
tests = [
    ["((K*|K*|(K*)*)*)*", "(K*|K*|K**)**"],
    ["(i((j))*)*|t*", "(ij*)*|t*"],
    ["((asssdfa|bbbssad|B*))*", "(asssdfa|bbbssad|B*)*"],
    ["((a|B*))*", "(a|B*)*"],
    ["((a*)*)*", "a***"],
    ["s*((l|B*)oF*)*", "s*((l|B*)oF*)*"],
    ["(QJ*|q|(M(un)|B)*|C)", "QJ*|q|(Mun|B)*|C"],
    ["((v*))*", "v**"],
    ["(abnm)*", "(abnm)*"],
    ["a|((a(a*)*)a|a(a)*)*", "a|(aa**a|aa*)*"]
]

In [6]:
# Run every parentheses-expansion fixture; silence means all of them passed.
for source, expected in tests:
    res = RegexParser(source, [100] * 20).parse()
    if str(res) == expected:
        continue
    # First mismatch: dump the parsed tree as an image, report it, and stop.
    res.plot().render("visualization/test.gv", format="png")
    print(f"Test {source}. Expected: {expected}, result: {res}")
    break

### Dataset

In [17]:
# Parsed form of every dataset regex; populated by the cell below, one entry per element of REGEXES.
BASE_REGEXES = []

In [18]:
# Rebuild the list from scratch so re-running this cell never appends
# duplicates, and parse everything before opening the file so a parser error
# cannot leave a truncated dataset behind.
BASE_REGEXES = [RegexParser(regex, [100] * 100).parse() for regex in REGEXES]

# One parsed regex per line, in the same order as REGEXES.
with open("datasets/base_dataset.csv", "w") as f:
    f.writelines(str(parsed) + "\n" for parsed in BASE_REGEXES)

## First approximation 

In [9]:
def get_approximation(str_regex: str) -> str:
    """Return the first approximation of a regex, rendered as a string.

    The expression is parsed and then reduced to a single branch: at every
    alternation or concatenation only the operand that reaches the greatest
    star depth is kept (the first one on ties), and a star around a composite
    operand is rebuilt around the operand chosen inside it.

    Args:
        str_regex: Regular expression in the syntax accepted by ``RegexParser``.

    Returns:
        ``str()`` of the approximated expression.

    Raises:
        TypeError: If the parsed tree contains a node type that this reduction
            does not handle.
        ValueError: If an alternation/concatenation has no operands.
    """

    def deepest(candidates):
        # Pick the (node, depth) pair with the greatest depth; ``max`` returns
        # the first maximal element, the same tie-break as ``np.argmax``.
        return max(candidates, key=lambda candidate: candidate[1])

    def approximate(regex: Regex, height: int = 0) -> Tuple[Regex, int]:
        """Return ``(approximation, star depth reached)`` for ``regex`` at depth ``height``."""
        if isinstance(regex, BaseRegex):
            return regex, height
        if isinstance(regex, (AlternativeRegex, ConcatenationRegex)):
            return deepest([approximate(value, height) for value in regex.value])
        if isinstance(regex, StarRegex):
            height += 1
            if isinstance(regex.value, BaseRegex):
                # A starred leaf is already minimal: keep the node itself.
                return regex, height
            # NOTE(review): this iterates the operands of the starred
            # sub-expression, so it assumes ``regex.value`` is an alternation
            # or concatenation — confirm the parser never nests a StarRegex
            # directly inside another StarRegex.
            best, depth = deepest(
                [approximate(value, height) for value in regex.value.value]
            )
            return StarRegex(best, 1), depth
        # Fail loudly instead of returning None, which callers would then try
        # to index and crash with an unhelpful "NoneType" error.
        raise TypeError(f"Unsupported regex node: {type(regex).__name__}")

    regex = RegexParser(str_regex, [1] * len(str_regex)).parse()
    approximation, _ = approximate(regex)
    return str(approximation)

In [10]:
# First-approximation fixtures: each entry is [input regex, expected get_approximation() output].
# Rebinds `tests`, replacing the parentheses-expansion fixtures defined earlier.
tests = [
    ["jd*", "d*"],
    ["(R*(o|l)(i*|g)*(h)*h)*", "i***"],
    ["z(q*|k*O*|HL*)*", "q**"],
    ["o(r*(H*j*|D*))*", "r**"],
    ["x*h((Q*|P))*", "Q**"]
]

In [11]:
# Run every first-approximation fixture; silence means all of them passed.
for test in tests:
    app = get_approximation(test[0])
    if test[1] != app:
        # Report first, so the message is shown even if rendering fails.
        print(f"Test {test[0]}. Expected: {test[1]}, result: {app}")
        # `app` is a plain string, so parse it again to draw the tree that was
        # actually produced; the previous code rendered `res`, a leftover from
        # the parentheses-expansion cell, i.e. an unrelated regex.
        # NOTE(review): reuses the second RegexParser argument from that cell
        # ([100] * 20) — confirm it suits approximation strings.
        RegexParser(app, [100] * 20).parse().plot().render("visualization/test.gv", format="png")
        break

### Dataset

In [12]:
# String approximations of the dataset regexes; populated by the cell below, one entry per element of REGEXES.
APPROX_REGEXES = []

In [13]:
# Rebuild the list from scratch so re-running this cell never appends
# duplicates, and compute every approximation before opening the file so a
# failure cannot leave a truncated dataset behind.
APPROX_REGEXES = [get_approximation(regex) for regex in REGEXES]

# One approximation per line, in the same order as REGEXES.
with open("datasets/approx_dataset.csv", "w") as f:
    f.writelines(str(approx) + "\n" for approx in APPROX_REGEXES)

## Learning

In [21]:
import pandas as pd

In [14]:
from pattern.pattern import NEVariable, NEPattern

The learner was run from the command line on the approximated dataset:

`python main.py -d datasets/approx_dataset.csv -o`

Length=2: 100%|██████████████████████████████████████████| 2/2 [00:00<?, ?it/s, pattern=f\*]

Length=3: 100%|██████████████████████████████████████████| 37/37 [00:00<?, ?it/s, pattern=x1\*\*]

Length=4: 100%|██████████████████████████████████████████| 26/26 [00:00<?, ?it/s, pattern=x1\*\*\*]

Length=5: 100%|██████████████████████████████████████████| 12/12 [00:00<00:00, 768.04it/s, pattern=x1\*\*\*\*] 

Length=6: 100%|██████████████████████████████████████████| 3/3 [00:00<?, ?it/s, pattern=x1\*\*\*\*\*]


Result: x1\*

In [16]:
# Rebuild by hand the pattern reported by the learning run (`x1*`): one variable followed by a literal "*".
# NOTE(review): assumes NEVariable() corresponds to the learner's `x1` — confirm.
result = NEPattern([NEVariable(), "*"])

In [19]:
# Evaluate the learned pattern against the string form of every base regex.
regex_strings = [str(regex) for regex in BASE_REGEXES]

data = {
    "regex": regex_strings,
    "match": [result.match(text) for text in regex_strings],
}

In [22]:
# One row per base regex: its string form and the value returned by `result.match` for it.
df = pd.DataFrame(data)

In [24]:
# Show only the rows where the learned pattern did not match the regex.
df[~df["match"]]

Unnamed: 0,regex,match
13,(ii*i*)**Bf,False
18,(r(k(u*d*)*)*O)*Z,False
21,QJ*|q|(M(un)*|B)*|C,False
24,(o*|I|dC)**L*|F,False
27,YN(l**|u*)a(Hc|E*)|l,False
28,((wN*O*d)**|l)*X,False
34,W*|(eU|o**M)*Z,False
60,T*p|(A|A**)*|A,False
63,(NU|DZ)***S,False
70,(I*K*)*m,False


In [25]:
# Persist the full regex/match table (matched and unmatched rows alike), without the index column.
df.to_csv("experiments/first_filtering.csv", index=False)