# SFILES Parser

The purpose of this notebook is to explore methods for parsing and processing SFILES text.

## SFILES Notation

SFILES strings are read left to right. Process groups are delimited with an left parenthesis "(" and a terminal right parenthesis ")", and are not nested. The contents of a process group consists of series of strings corresponding to process streams separated by a forward slash "/". Process streams consist of an optional type designated by a sequence of lower case alphabetic characters, followed by a sequence of one or more upper case alphabetic characters denoting chemical species.

## SFILES Validator

### Process Types

In [None]:
types = {
    'i'    : {'name': 'input'},
    'o'    : {'name': 'output'},
    'f'    : {'name': 'flash'},
    'e'    : {'name': ''},
    'm'    : {'name': ''},
    'n'    : {'name': ''},
    'p'    : {'name': 'reactor product'},
    'cyc'  : {'name': 'solvent based azeotropic distillation'}, 
    'r'    : {'name': 'reactor'},
    'sw'   : {'name': 'pressure swing distillation'},
    'pms'  : {'name': 'polar molecule sieve based separation'},
    'ms'   : {'name': 'molecular sieve based separation'},
    'lmem' : {'name': 'liquid membrane based separation'},
    'gmem' : {'name': 'gas membrane based separation'},
    'crs'  : {'name': 'crystallization'},
    'ab'   : {'name': 'absorption'}
}

### Parsing Rules

In [None]:
from pyparsing import Literal, Word, Group, Suppress
from pyparsing import Optional, OneOrMore, ZeroOrMore, oneOf
from pyparsing import alphas, nums

LPAR  = Suppress("(")
RPAR  = Suppress(")")
SLASH = Suppress("/")
GT = Literal(">")
LT = Literal("<")

# components and mixtures
component = Word(alphas.upper(), exact=1)
mixture = Group(OneOrMore(component))

# first unit and stream in a process group
type = Optional(oneOf(' '.join(types.keys())), default='dist')
stream = Group(type + mixture)

# subsequent units and streams in a process group
type_ = Optional(oneOf(' '.join(types.keys())), default='s')
stream_ = Group(type_ + mixture)

# process group
processgroup = LPAR + stream + ZeroOrMore(SLASH + stream_) + RPAR
           
# process sequence is comprised of connectors, process group, and recycles                                             
connector = Optional(GT | LT, default=GT)
recycle = Word(nums, exact=1)
sequence = processgroup + ZeroOrMore(connector + (processgroup | recycle ))

# nested branches
branchsequence = OneOrMore(connector + (processgroup | recycle ))
branch = nestedExpr(opener="[", closer="]", content=branchsequence)

# sfiles expression
sfiles = sequence + ZeroOrMore(branch | sequence)

In [93]:

results = sfiles.parseString('(A/BC)').asList()
results

[['dist', ['A']], ['s', ['B', 'C']]]

## Tests

In [69]:
processgroup.runTests("""\
    (A/BC)
    (ABC/DE)
    (cycA/B)
    (fABC/BCD)
    (rABC/nE/pABCD)
    (rABC/nE/pABCD)
    (swA/B)
    (pmsABC/D)
    (msABC/D)
    (lmemABC/D)
    (gmemABC/D)
    (crsABC/D)
    (abEAB/eF/EABF/EF)
    (iABCD)
    (oABD)
    (ABC/D)
    (rAB/pABD)
""")


(A/BC)
[['dist', ['A']], ['s', ['B', 'C']]]
[0]:
  ['dist', ['A']]
  [0]:
    dist
  [1]:
    ['A']
[1]:
  ['s', ['B', 'C']]
  [0]:
    s
  [1]:
    ['B', 'C']


(ABC/DE)
[['dist', ['A', 'B', 'C']], ['s', ['D', 'E']]]
[0]:
  ['dist', ['A', 'B', 'C']]
  [0]:
    dist
  [1]:
    ['A', 'B', 'C']
[1]:
  ['s', ['D', 'E']]
  [0]:
    s
  [1]:
    ['D', 'E']


(cycA/B)
[['cyc', ['A']], ['s', ['B']]]
[0]:
  ['cyc', ['A']]
  [0]:
    cyc
  [1]:
    ['A']
[1]:
  ['s', ['B']]
  [0]:
    s
  [1]:
    ['B']


(fABC/BCD)
[['f', ['A', 'B', 'C']], ['s', ['B', 'C', 'D']]]
[0]:
  ['f', ['A', 'B', 'C']]
  [0]:
    f
  [1]:
    ['A', 'B', 'C']
[1]:
  ['s', ['B', 'C', 'D']]
  [0]:
    s
  [1]:
    ['B', 'C', 'D']


(rABC/nE/pABCD)
[['r', ['A', 'B', 'C']], ['n', ['E']], ['p', ['A', 'B', 'C', 'D']]]
[0]:
  ['r', ['A', 'B', 'C']]
  [0]:
    r
  [1]:
    ['A', 'B', 'C']
[1]:
  ['n', ['E']]
  [0]:
    n
  [1]:
    ['E']
[2]:
  ['p', ['A', 'B', 'C', 'D']]
  [0]:
    p
  [1]:
    ['A', 'B', 'C', 'D']


(rABC/nE/

(True,
 [('(A/BC)',
   ([(['dist', (['A'], {})], {}), (['s', (['B', 'C'], {})], {})], {})),
  ('(ABC/DE)',
   ([(['dist', (['A', 'B', 'C'], {})], {}), (['s', (['D', 'E'], {})], {})], {})),
  ('(cycA/B)', ([(['cyc', (['A'], {})], {}), (['s', (['B'], {})], {})], {})),
  ('(fABC/BCD)',
   ([(['f', (['A', 'B', 'C'], {})], {}), (['s', (['B', 'C', 'D'], {})], {})], {})),
  ('(rABC/nE/pABCD)',
   ([(['r', (['A', 'B', 'C'], {})], {}), (['n', (['E'], {})], {}), (['p', (['A', 'B', 'C', 'D'], {})], {})], {})),
  ('(rABC/nE/pABCD)',
   ([(['r', (['A', 'B', 'C'], {})], {}), (['n', (['E'], {})], {}), (['p', (['A', 'B', 'C', 'D'], {})], {})], {})),
  ('(swA/B)', ([(['sw', (['A'], {})], {}), (['s', (['B'], {})], {})], {})),
  ('(pmsABC/D)',
   ([(['pms', (['A', 'B', 'C'], {})], {}), (['s', (['D'], {})], {})], {})),
  ('(msABC/D)',
   ([(['ms', (['A', 'B', 'C'], {})], {}), (['s', (['D'], {})], {})], {})),
  ('(lmemABC/D)',
   ([(['lmem', (['A', 'B', 'C'], {})], {}), (['s', (['D'], {})], {})], {})),
  (

In [56]:
sequence.runTests("""\
    (iA)(rAB/pABCD)
    (iA)(oB)(iC)
    (iA)<(oB)>2
    (A/BC)2(oD)
    (A/BC)1
""")


(iA)(rAB/pABCD)
[['i', ['A']], ">", ['r', ['A', 'B']], ['p', ['A', 'B', 'C', 'D']]]
[0]:
  ['i', ['A']]
  - type: 'i'
[1]:
  ">"
[2]:
  ['r', ['A', 'B']]
  - type: 'r'
[3]:
  ['p', ['A', 'B', 'C', 'D']]
  - type: 'p'


(iA)(oB)(iC)
[['i', ['A']], ">", ['o', ['B']], ">", ['i', ['C']]]
[0]:
  ['i', ['A']]
  - type: 'i'
[1]:
  ">"
[2]:
  ['o', ['B']]
  - type: 'o'
[3]:
  ">"
[4]:
  ['i', ['C']]
  - type: 'i'


(iA)<(oB)>2
[['i', ['A']], '<', ['o', ['B']], '>', '2']
[0]:
  ['i', ['A']]
  - type: 'i'
[1]:
  <
[2]:
  ['o', ['B']]
  - type: 'o'
[3]:
  >
[4]:
  2


(A/BC)2(oD)
[['dist', ['A']], ['s', ['B', 'C']], ">", '2', ">", ['o', ['D']]]
[0]:
  ['dist', ['A']]
  - type: 'dist'
[1]:
  ['s', ['B', 'C']]
  - type: 's'
[2]:
  ">"
[3]:
  2
[4]:
  ">"
[5]:
  ['o', ['D']]
  - type: 'o'


(A/BC)1
[['dist', ['A']], ['s', ['B', 'C']], ">", '1']
[0]:
  ['dist', ['A']]
  - type: 'dist'
[1]:
  ['s', ['B', 'C']]
  - type: 's'
[2]:
  ">"
[3]:
  1



(True,
 [('(iA)(rAB/pABCD)',
   ([(['i', (['A'], {})], {'type': ['i']}), ">", (['r', (['A', 'B'], {})], {'type': ['r']}), (['p', (['A', 'B', 'C', 'D'], {})], {'type': ['p']})], {})),
  ('(iA)(oB)(iC)',
   ([(['i', (['A'], {})], {'type': ['i']}), ">", (['o', (['B'], {})], {'type': ['o']}), ">", (['i', (['C'], {})], {'type': ['i']})], {})),
  ('(iA)<(oB)>2',
   ([(['i', (['A'], {})], {'type': ['i']}), '<', (['o', (['B'], {})], {'type': ['o']}), '>', '2'], {})),
  ('(A/BC)2(oD)',
   ([(['dist', (['A'], {})], {'type': ['dist']}), (['s', (['B', 'C'], {})], {'type': ['s']}), ">", '2', ">", (['o', (['D'], {})], {'type': ['o']})], {})),
  ('(A/BC)1',
   ([(['dist', (['A'], {})], {'type': ['dist']}), (['s', (['B', 'C'], {})], {'type': ['s']}), ">", '1'], {}))])

In [57]:
branch.runTests("""\
    [(oA)]
    [<(oD)]
    [<(A/BD)]
    [(A/BD)(BD/B)[(oA)]]
""")


[(oA)]
[[">", ['o', ['A']]]]
[0]:
  [">", ['o', ['A']]]
  [0]:
    ">"
  [1]:
    ['o', ['A']]
    - type: 'o'


[<(oD)]
[['<', ['o', ['D']]]]
[0]:
  ['<', ['o', ['D']]]
  [0]:
    <
  [1]:
    ['o', ['D']]
    - type: 'o'


[<(A/BD)]
[['<', ['dist', ['A']], ['s', ['B', 'D']]]]
[0]:
  ['<', ['dist', ['A']], ['s', ['B', 'D']]]
  [0]:
    <
  [1]:
    ['dist', ['A']]
    - type: 'dist'
  [2]:
    ['s', ['B', 'D']]
    - type: 's'


[(A/BD)(BD/B)[(oA)]]
[[">", ['dist', ['A']], ['s', ['B', 'D']], ">", ['dist', ['B', 'D']], ['s', ['B']], [">", ['o', ['A']]]]]
[0]:
  [">", ['dist', ['A']], ['s', ['B', 'D']], ">", ['dist', ['B', 'D']], ['s', ['B']], [">", ['o', ['A']]]]
  [0]:
    ">"
  [1]:
    ['dist', ['A']]
    - type: 'dist'
  [2]:
    ['s', ['B', 'D']]
    - type: 's'
  [3]:
    ">"
  [4]:
    ['dist', ['B', 'D']]
    - type: 'dist'
  [5]:
    ['s', ['B']]
    - type: 's'
  [6]:
    [">", ['o', ['A']]]
    [0]:
      ">"
    [1]:
      ['o', ['A']]
      - type: 'o'



(True,
 [('[(oA)]', ([([">", (['o', (['A'], {})], {'type': ['o']})], {})], {})),
  ('[<(oD)]', ([(['<', (['o', (['D'], {})], {'type': ['o']})], {})], {})),
  ('[<(A/BD)]',
   ([(['<', (['dist', (['A'], {})], {'type': ['dist']}), (['s', (['B', 'D'], {})], {'type': ['s']})], {})], {})),
  ('[(A/BD)(BD/B)[(oA)]]',
   ([([">", (['dist', (['A'], {})], {'type': ['dist']}), (['s', (['B', 'D'], {})], {'type': ['s']}), ">", (['dist', (['B', 'D'], {})], {'type': ['dist']}), (['s', (['B'], {})], {'type': ['s']}), ([">", (['o', (['A'], {})], {'type': ['o']})], {})], {})], {}))])

In [58]:
sfiles.runTests("""\
    (iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iB)(rAB/pABCD)<1<2[<(iA)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iA)(rAB/pABCD)[(iB)]    
    (iA)(rAB/pABCD)[<(iB)]
    (iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iB)(rAB/pABCD)<1<2[<(iA)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iA)(rAB/pABCD)[<(iB)](mABC/D)[(oD)](A/BC)[(oA)](oBC)
    (iA)[(oB)(oC)[(oD)]]
    (iABCDE)(AB/CDE)[(A/B)[(oA)](oB)]
""")


(iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
[['i', ['A']], ">", ['r', ['A', 'B']], ['p', ['A', 'B', 'C', 'D']], '<', '1', '<', '2', ['<', ['i', ['B']]], ['m', ['A', 'B', 'C']], ['s', ['D']], ['<', ['o', ['D']]], ['dist', ['A']], ['s', ['B', 'C']], ">", '1', ">", ['cyc', ['B']], ['s', ['C']], ">", '2', ">", ['o', ['C']]]
[0]:
  ['i', ['A']]
  - type: 'i'
[1]:
  ">"
[2]:
  ['r', ['A', 'B']]
  - type: 'r'
[3]:
  ['p', ['A', 'B', 'C', 'D']]
  - type: 'p'
[4]:
  <
[5]:
  1
[6]:
  <
[7]:
  2
[8]:
  ['<', ['i', ['B']]]
  [0]:
    <
  [1]:
    ['i', ['B']]
    - type: 'i'
[9]:
  ['m', ['A', 'B', 'C']]
  - type: 'm'
[10]:
  ['s', ['D']]
  - type: 's'
[11]:
  ['<', ['o', ['D']]]
  [0]:
    <
  [1]:
    ['o', ['D']]
    - type: 'o'
[12]:
  ['dist', ['A']]
  - type: 'dist'
[13]:
  ['s', ['B', 'C']]
  - type: 's'
[14]:
  ">"
[15]:
  1
[16]:
  ">"
[17]:
  ['cyc', ['B']]
  - type: 'cyc'
[18]:
  ['s', ['C']]
  - type: 's'
[19]:
  ">"
[20]:
  2
[21]:
  ">"
[22]:
  ['o', ['C']]
  - ty

(True,
 [('(iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)',
   ([(['i', (['A'], {})], {'type': ['i']}), ">", (['r', (['A', 'B'], {})], {'type': ['r']}), (['p', (['A', 'B', 'C', 'D'], {})], {'type': ['p']}), '<', '1', '<', '2', (['<', (['i', (['B'], {})], {'type': ['i']})], {}), (['m', (['A', 'B', 'C'], {})], {'type': ['m']}), (['s', (['D'], {})], {'type': ['s']}), (['<', (['o', (['D'], {})], {'type': ['o']})], {}), (['dist', (['A'], {})], {'type': ['dist']}), (['s', (['B', 'C'], {})], {'type': ['s']}), ">", '1', ">", (['cyc', (['B'], {})], {'type': ['cyc']}), (['s', (['C'], {})], {'type': ['s']}), ">", '2', ">", (['o', (['C'], {})], {'type': ['o']})], {})),
  ('(iB)(rAB/pABCD)<1<2[<(iA)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)',
   ([(['i', (['B'], {})], {'type': ['i']}), ">", (['r', (['A', 'B'], {})], {'type': ['r']}), (['p', (['A', 'B', 'C', 'D'], {})], {'type': ['p']}), '<', '1', '<', '2', (['<', (['i', (['A'], {})], {'type': ['i']})], {}), (['m', (['A', 'B', 'C'], {})], {

## Flowsheets

A simplified version of SFILES is sometimes useful for describing the layout of flowsheets with the detail of describing process groups. 


In [None]:
from pyparsing import Literal, Word, Group
from pyparsing import Optional, OneOrMore, ZeroOrMore, oneOf
from pyparsing import alphas, nums
from pyparsing import nestedExpr

# process group
processgroup = Word(alphas.upper(), exact=1)
           
# process sequence is comprised of connectors, process group, and recycles                                             
connector = Optional(Literal('>') | Literal('<'), default=Literal('>'))
recycle = Word(nums, exact=1)
sequence = OneOrMore(connector + (processgroup | recycle ))

# nested branches
branch = nestedExpr(opener="[", closer="]", content=sequence)
branch.setName('branch')

# sfiles expression
sfiles = sequence + ZeroOrMore(branch | sequence)

In [37]:
result = sfiles.parseString('OA<1E[P]F[Q]G<2MN[R]J<3K[IH3]L[S]D2CB1')
result

([">", 'O', ">", 'A', '<', '1', ">", 'E', ([">", 'P'], {}), ">", 'F', ([">", 'Q'], {}), ">", 'G', '<', '2', ">", 'M', ">", 'N', ([">", 'R'], {}), ">", 'J', '<', '3', ">", 'K', ([">", 'I', ">", 'H', ">", '3'], {}), ">", 'L', ([">", 'S'], {}), ">", 'D', ">", '2', ">", 'C', ">", 'B', ">", '1'], {})

In [None]:
fo