In [None]:
from IPython.core.display import HTML
# Read the stylesheet file that lives one directory above this notebook.
with open('../style.css', 'r') as file:
    css = file.read()
# Wrap the file contents in an IPython HTML display object.  Being the last
# expression of the cell, the object is what the notebook shows as output.
HTML(css)

# From Regular Expressions to <span style="font-variant:small-caps;">Fsm</span>s

This notebook shows how a given regular expression $r$ can be transformed into an equivalent finite state machine. 
It implements the theory that is outlined in section 4.4. of the 
lecture notes.

The class `RegExp2NFA` administers two member variables:
- `Sigma` is the <em style="color:blue">alphabet</em>, i.e. the set of characters used.
- `StateCount` is a counter that is needed to create <em style="color:blue">unique</em> state names.

In [None]:
class RegExp2NFA:
    """Converter that turns regular expressions into finite state machines.

    Attributes:
        Sigma:      the alphabet, stored exactly as passed to the constructor.
        StateCount: integer counter used to generate state names; starts at 0.
    """

    def __init__(self, Sigma):
        """Remember the alphabet ``Sigma`` and reset the state counter."""
        self.StateCount = 0
        self.Sigma = Sigma

The member function `toNFA` takes an object `self` of class `RegExp2NFA` and a regular expression `r` and returns a finite state machine 
that accepts the same language as described by `r`.  The regular expression is represented in `Python` as follows:
- The regular expression $\emptyset$ is represented as the number `0`.
- The regular expression $\varepsilon$ is represented as the empty string `''`.
- The regular expression $c$ that matches the character $c$ is represented by the character $c$.
- The regular expression $r_1 \cdot r_2$  is represented by the triple $\bigl(\texttt{'cat'}, \texttt{repr}(r_1), \texttt{repr}(r_2)\bigr)$.

  Here, and in the following, for a given regular expression $r$ the expression $\texttt{repr}(r)$ denotes the `Python` representation of the regular 
  expression $r$.
- The regular expression $r_1 + r_2$  is represented by the triple $\bigl(\texttt{'or'}, \texttt{repr}(r_1), \texttt{repr}(r_2)\bigr)$.
- The regular expression $r^*$  is represented by the pair $\bigl(\texttt{'star'}, \texttt{repr}(r)\bigr)$.

In [None]:
def toNFA(self, r):
    """Translate the regular expression ``r`` into a finite state machine.

    The regular expression is expected in the following representation:
      * ``0``                   -- the empty language,
      * ``''``                  -- the empty word,
      * a one-character string  -- that single character,
      * ``('cat', r1, r2)``     -- concatenation of ``r1`` and ``r2``,
      * ``('or', r1, r2)``      -- alternative of ``r1`` and ``r2``,
      * ``('star', r1)``        -- Kleene star of ``r1``.

    Sub-expressions are translated recursively, left operand first, and the
    results are combined by ``catenate``, ``disjunction`` or ``kleene``.

    Returns:
        The value produced by the matching builder method of ``self``.

    Raises:
        ValueError: if ``r`` has none of the shapes listed above.
    """
    if r == 0:
        return self.genEmptyNFA()
    if r == '':
        return self.genEpsilonNFA()
    if isinstance(r, str) and len(r) == 1:
        return self.genCharNFA(r)
    # Only a non-empty tuple/list can be a composite expression.  Checking the
    # container type and the number of operands before indexing makes
    # malformed input such as 1, None, () or ('cat', 'a') end up in the
    # ValueError below instead of an incidental TypeError or IndexError.
    # Surplus components after the required operands are ignored.
    if isinstance(r, (tuple, list)) and r:
        tag = r[0]
        operands = len(r) - 1
        if tag == 'cat' and operands >= 2:
            return self.catenate(self.toNFA(r[1]), self.toNFA(r[2]))
        if tag == 'or' and operands >= 2:
            return self.disjunction(self.toNFA(r[1]), self.toNFA(r[2]))
        if tag == 'star' and operands >= 1:
            return self.kleene(self.toNFA(r[1]))
    raise ValueError(f'{r} is not a proper regular expression.')
    
# Attach the function to the class as the method `toNFA`, then delete the
# module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.toNFA = toNFA
del toNFA

The <span style="font-variant:small-caps;">Fsm</span> `genEmptyNFA()` is defined as
$$\bigl\langle \{ q_0, q_1 \}, \Sigma, \{\}, q_0, \{ q_1 \} \bigr\rangle. $$
Note that this <span style="font-variant:small-caps;">Fsm</span> has no transitions at all.

In [None]:
def genEmptyNFA(self):
    """Build the automaton for the regular expression ``0`` (empty language).

    Two fresh states are requested from ``getNewState``; the first becomes the
    start state, the second the only accepting state.  The transition table
    is empty, so there is no way to get from the start state to the accepting
    state.

    Returns:
        The 5-tuple (states, alphabet, transitions, start, accepting states).
    """
    start = self.getNewState()
    accept = self.getNewState()
    states = {start, accept}
    no_transitions = {}
    return states, self.Sigma, no_transitions, start, {accept}

# Attach the function to the class as the method `genEmptyNFA`, then delete
# the module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.genEmptyNFA = genEmptyNFA
del genEmptyNFA

The <span style="font-variant:small-caps;">Fsm</span> `genEpsilonNFA` is defined as
$$  \bigl\langle \{ q_0, q_1 \}, \Sigma, 
                          \bigl\{ \langle q_0, \varepsilon\rangle \mapsto \{q_1\} \bigr\}, q_0, \{ q_1 \} \bigr\rangle.
$$

In [None]:
def genEpsilonNFA(self):
    """Build the automaton for the regular expression ``''`` (empty word).

    Two fresh states are requested from ``getNewState``.  The only transition
    leads from the start state to the accepting state and is labelled with
    the empty string, which is how epsilon is written in the transition table.

    Returns:
        The 5-tuple (states, alphabet, transitions, start, accepting states).
    """
    start = self.getNewState()
    accept = self.getNewState()
    epsilon_step = {(start, ''): {accept}}
    states = {start, accept}
    return states, self.Sigma, epsilon_step, start, {accept}

# Attach the function to the class as the method `genEpsilonNFA`, then delete
# the module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.genEpsilonNFA = genEpsilonNFA 
del genEpsilonNFA

For a letter $c \in \Sigma$ the <span style="font-variant:small-caps;">Fsm</span> `genCharNFA`$(c)$ is defined as 
$$ A(c) = 
   \bigl\langle \{ q_0, q_1 \}, \Sigma, 
   \bigl\{ \langle q_0, c \rangle \mapsto \{q_1\}\bigr\}, q_0, \{ q_1 \} \bigr\rangle.
$$

In [None]:
def genCharNFA(self, c):
    """Build the automaton for the regular expression consisting of ``c``.

    Two fresh states are requested from ``getNewState``.  The only transition
    leads from the start state to the accepting state and is labelled ``c``.

    Returns:
        The 5-tuple (states, alphabet, transitions, start, accepting states).
    """
    start = self.getNewState()
    accept = self.getNewState()
    read_c = {(start, c): {accept}}
    states = {start, accept}
    return states, self.Sigma, read_c, start, {accept}

# Attach the function to the class as the method `genCharNFA`, then delete
# the module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.genCharNFA = genCharNFA
del genCharNFA

Given two <span style="font-variant:small-caps;">Fsm</span>s `f1` and `f2`, the function `catenate(f1, f2)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it can be written 
in the form
$$ s = s_1s_2 $$
and $s_1$ is recognized by `f1` and $s_2$ is recognized by `f2`. 

Assume that $f_1$ and $f_2$ have the following form:
- $f_1 = \langle Q_1, \Sigma, \delta_1, q_1, \{ q_2 \}\rangle$,
- $f_2 = \langle Q_2, \Sigma, \delta_2, q_3, \{ q_4 \}\rangle$,
- $Q_1 \cap Q_2 = \{\}$.
 
Then $\texttt{catenate}(f_1, f_2)$ is defined as:
$$  \bigl\langle Q_1 \cup Q_2, \Sigma, 
   \bigl\{ \langle q_2,\varepsilon\rangle  \mapsto \{q_3\} \bigr\} 
         \cup \delta_1 \cup \delta_2, q_1, \{ q_4 \} \bigr\rangle.
$$

In [None]:
def catenate(self, f1, f2):
    """Combine ``f1`` and ``f2`` into an automaton for their concatenation.

    Both arguments are 5-tuples (states, alphabet, transitions, start,
    accepting states).  The transition tables are merged with ``dict_union``
    and an epsilon transition (label ``''``) is added from an accepting state
    of ``f1`` to the start state of ``f2``.

    NOTE(review): only one accepting state of ``f1`` is linked (the one
    chosen by ``arb``); this presumably relies on every automaton built here
    having exactly one accepting state -- confirm before passing other NFAs.

    Returns:
        The 5-tuple made of the union of both state sets, the alphabet of
        ``f2``, the merged transition table, the start state of ``f1`` and
        the accepting states of ``f2``.
    """
    # The alphabet of f1 is not needed: the result carries the one from f2.
    M1, _, delta1, q1, A1 = f1
    M2, Sigma, delta2, q3, A2 = f2
    q2 = arb(A1)
    delta = dict_union(delta1, delta2)
    # Leaving the accepting state of f1 without reading input enters f2.
    delta[q2, ''] = {q3}
    return M1 | M2, Sigma, delta, q1, A2

# Attach the function to the class as the method `catenate`, then delete the
# module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.catenate = catenate
del catenate

Given two <span style="font-variant:small-caps;">Fsm</span>s `f1` and `f2`, the function `disjunction(f1, f2)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it 
is recognized either by `f1` or by `f2`. 

Assume again that the states of 
$f_1$ and $f_2$ are different and that $f_1$ and $f_2$ have the following form:
- $f_1 = \langle Q_1, \Sigma, \delta_1, q_1, \{ q_3 \}\rangle$,
- $f_2 = \langle Q_2, \Sigma, \delta_2, q_2, \{ q_4 \}\rangle$,
- $Q_1 \cap Q_2 = \{\}$.

Then $\texttt{disjunction}(f_1, f_2)$ is defined as follows:
$$ \bigl\langle \{ q_0, q_5 \} \cup Q_1 \cup Q_2, \Sigma, 
                \bigl\{ \langle q_0,\varepsilon\rangle \mapsto \{q_1, q_2\},
                   \langle q_3,\varepsilon\rangle \mapsto \{q_5\}, 
                   \langle q_4,\varepsilon\rangle \mapsto \{q_5\} \bigr\} 
                   \cup \delta_1 \cup \delta_2, q_0, \{ q_5 \} \bigr\rangle
$$

In [None]:
def disjunction(self, f1, f2):
    """Combine ``f1`` and ``f2`` into an automaton for their alternative.

    Both arguments are 5-tuples (states, alphabet, transitions, start,
    accepting states).  A new start state and a new accepting state are
    requested from ``getNewState``.  Epsilon transitions (label ``''``) lead
    from the new start state to both old start states, and from one accepting
    state of each argument (chosen by ``arb``) to the new accepting state.

    Returns:
        The 5-tuple made of all states, the alphabet of ``f2``, the extended
        transition table, the new start state and the new accepting state.
    """
    states1, _, trans1, start1, accepting1 = f1
    states2, Sigma, trans2, start2, accepting2 = f2
    old_accept1 = arb(accepting1)
    old_accept2 = arb(accepting2)
    new_start = self.getNewState()
    new_accept = self.getNewState()
    # Epsilon transitions that fork into both branches and join them again.
    glue = {
        (new_start, ''): {start1, start2},
        (old_accept1, ''): {new_accept},
        (old_accept2, ''): {new_accept},
    }
    delta = dict_union(dict_union(trans1, trans2), glue)
    all_states = {new_start, new_accept} | states1 | states2
    return all_states, Sigma, delta, new_start, {new_accept}
    
# Attach the function to the class as the method `disjunction`, then delete
# the module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.disjunction = disjunction
del disjunction

Given an <span style="font-variant:small-caps;">Fsm</span> `f`, the function `kleene(f)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it can be written as
$$ s = s_1 s_2 \cdots s_n $$
and all $s_i$ are recognized by `f`.  Note that $n$ might be $0$. 

If `f` is defined as
$$ f = \langle Q, \Sigma, \delta, q_1, \{ q_2 \} \rangle,
$$
then  `kleene(f)` is defined as follows:
$$ \bigl\langle \{ q_0, q_3 \} \cup Q, \Sigma, 
                \bigl\{ \langle q_0,\varepsilon\rangle \mapsto \{q_1, q_3\},  
                        \langle q_2,\varepsilon\rangle \mapsto \{q_1, q_3\} \bigr\} 
                \cup \delta, q_0, \{ q_3 \} \bigr\rangle.
$$

In [None]:
def kleene(self, f):
    """Build an automaton for the Kleene star of the automaton ``f``.

    ``f`` is a 5-tuple (states, alphabet, transitions, start, accepting
    states).  A new start state and a new accepting state are requested from
    ``getNewState``.  Epsilon transitions (label ``''``) lead from the new
    start state and from one accepting state of ``f`` (chosen by ``arb``) both
    back to the start state of ``f`` and on to the new accepting state.

    The transition table of ``f`` is copied before it is extended, so the
    argument is left unchanged.

    Returns:
        The 5-tuple made of all states, the alphabet of ``f``, the extended
        transition table, the new start state and the new accepting state.
    """
    M, Sigma, delta0, q1, A = f
    q2 = arb(A)
    q0 = self.getNewState()
    q3 = self.getNewState()
    # Work on a shallow copy: assigning into delta0 itself would silently add
    # the new transitions to the automaton that the caller passed in.
    delta = dict(delta0)
    delta[q0, ''] = {q1, q3}
    delta[q2, ''] = {q1, q3}
    return {q0, q3} | M, Sigma, delta, q0, {q3}

# Attach the function to the class as the method `kleene`, then delete the
# module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.kleene = kleene
del kleene

The function `getNewState` returns a new number that has not yet been used as a state.

In [None]:
def getNewState(self):
    """Return the next unused state number and record it in ``StateCount``.

    Every call yields a value one larger than the previous call on the same
    object.
    """
    fresh = self.StateCount + 1
    self.StateCount = fresh
    return fresh

# Attach the function to the class as the method `getNewState`, then delete
# the module-level name so that it is reachable only through RegExp2NFA.
RegExp2NFA.getNewState = getNewState
del getNewState

The function `arb(S)` returns an arbitrary member from the set `S`.

In [None]:
def arb(S):
    """Return the first element obtained by iterating over ``S``.

    If ``S`` yields no element at all, ``None`` is returned.
    """
    return next(iter(S), None)

The function `dict_union` takes two dictionaries $d_1$ and $d_2$ as arguments.  It returns a new dictionary $d$ that is defined as follows:
$$
d[k] = \left\{ \begin{array}{ll}
                d_1[k] & \mbox{if $d_1[k]$ is defined and $d_2[k]$ is undefined;} \\
                d_2[k] & \mbox{if $d_2[k]$ is defined.}
               \end{array}
       \right.
$$
The arguments $d_1$ and $d_2$ are left unchanged.

In [None]:
def dict_union(d1, d2):
    """Return a new dictionary holding the entries of ``d1`` and ``d2``.

    For a key that occurs in both dictionaries the value from ``d2`` is used.
    Neither argument is modified.
    """
    merged = {}
    merged.update(d1)
    merged.update(d2)  # entries of d2 override those of d1
    return merged

In [None]:
# Small demonstration of dict_union: the key 'c' occurs in both dictionaries,
# so the merged dictionary takes its value for 'c' from the second argument B.
A = { 'a': 1, 'c': 3 }
B = { 'b': 2, 'c': 4 }
dict_union(A, B)

In [None]:
# Show A and B again to check that dict_union has left both arguments unchanged.
A, B

The notebook `Test-Regexp-2-NFA` can be used to test the functions implemented in this notebook.