# SFILES Parser

The purpose of this notebook is to explore methods for parsing and processing SFILES text.

## SFILES Notation

SFILES strings are read left to right. Process groups are delimited by a left parenthesis "(" and a terminal right parenthesis ")", and are not nested. The contents of a process group consist of a series of strings corresponding to process streams, separated by a forward slash "/". Each process stream consists of an optional type, designated by a sequence of lower case alphabetic characters, followed by a sequence of one or more upper case alphabetic characters denoting chemical species.

## SFILES Validator

### Process Types

In [2]:
# Lookup table of process-type prefixes. Each lower-case code maps to a small
# record holding its descriptive name; 'e', 'm' and 'n' currently carry an
# empty name.
types = {
    code: {'name': label}
    for code, label in (
        ('i',    'input'),
        ('o',    'output'),
        ('f',    'flash'),
        ('e',    ''),
        ('m',    ''),
        ('n',    ''),
        ('p',    'reactor product'),
        ('cyc',  'solvent based azeotropic distillation'),
        ('r',    'reactor'),
        ('sw',   'pressure swing distillation'),
        ('pms',  'polar molecule sieve based separation'),
        ('ms',   'molecular sieve based separation'),
        ('lmem', 'liquid membrane based separation'),
        ('gmem', 'gas membrane based separation'),
        ('crs',  'crystallization'),
        ('ab',   'absorption'),
    )
}

### Parsing Rules

In [106]:
from pyparsing import Literal, Word, Group, Suppress
from pyparsing import Optional, OneOrMore, ZeroOrMore, oneOf, nestedExpr
from pyparsing import alphas, nums

# Punctuation. Parentheses and slashes only delimit structure, so they are
# suppressed from the parse results; connectors are kept as tokens.
LPAR  = Suppress("(")
RPAR  = Suppress(")")
LBRA  = Literal("[")
RBRA  = Literal("]")
SLASH = Suppress("/")
GT = Literal(">")
LT = Literal("<")

# components and mixtures: a component is a single upper case letter and a
# mixture is one or more components, grouped into one sub-list
component = Word(alphas.upper(), exact=1)
mixture = Group(OneOrMore(component))

# first unit and stream in a process group; when no type prefix is present
# the token 'dist' is inserted
# NOTE(review): this name shadows the Python builtin `type` for every later
# cell; it is kept only so existing references keep working.
type = Optional(oneOf(' '.join(types.keys())), default='dist')
stream = Group(type + mixture)

# subsequent units and streams in a process group; when no type prefix is
# present the token 's' is inserted
type_ = Optional(oneOf(' '.join(types.keys())), default='s')
stream_ = Group(type_ + mixture)

# process group: "(" stream ("/" stream)* ")"
processgroup = Group(LPAR + stream + ZeroOrMore(SLASH + stream_) + RPAR)

# process sequence is comprised of connectors, process group, and recycles
# The default must be the plain string ">" (what Literal(">") yields when it
# matches). Passing the Literal object itself would put a parser element,
# not a string, into the results whenever the connector is implied.
connector = Optional(GT | LT, default=">")
recycle = Word(nums, exact=1)
sequence = Group(processgroup + ZeroOrMore(connector + (processgroup | recycle )))

# nested branches: bracketed sub-sequences, which may themselves contain
# further bracketed branches (handled by nestedExpr)
branchsequence = OneOrMore(connector + (processgroup | recycle ))
branch = nestedExpr(opener=LBRA, closer=RBRA, content=branchsequence)

# sfiles expression start with sequence
sfiles = sequence + ZeroOrMore(branch | sequence)

In [109]:
# Example: a branch-free flowsheet string that contains the recycle marker "2".
results = sfiles.parseString(
    '(iABC)(A/BC)2(oA)(B/C)(oB)(oC)'
)
results.asList()

[[[['i', ['A', 'B', 'C']]],
  ">",
  [['dist', ['A']], ['s', ['B', 'C']]],
  ">",
  '2',
  ">",
  [['o', ['A']]],
  ">",
  [['dist', ['B']], ['s', ['C']]],
  ">",
  [['o', ['B']]],
  ">",
  [['o', ['C']]]]]

In [110]:
# Example: a flowsheet string with explicit "<" connectors, recycle markers
# and bracketed branches.
results = sfiles.parseString(
    '(iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)'
)
results.asList()

[[[['i', ['A']]],
  ">",
  [['r', ['A', 'B']], ['p', ['A', 'B', 'C', 'D']]],
  '<',
  '1',
  '<',
  '2'],
 ['<', [['i', ['B']]]],
 [[['m', ['A', 'B', 'C']], ['s', ['D']]]],
 ['<', [['o', ['D']]]],
 [[['dist', ['A']], ['s', ['B', 'C']]],
  ">",
  '1',
  ">",
  [['cyc', ['B']], ['s', ['C']]],
  ">",
  '2',
  ">",
  [['o', ['C']]]]]

## Tests

In [111]:
# Test inputs. Each string holds one SFILES expression per line; the test
# runner splits on whitespace, so indentation and trailing blanks are ignored.

# Single process groups covering the type prefixes.
# NOTE(review): "(rABC/nE/pABCD)" appears twice in this list - confirm whether
# a different case was intended.
processgroup_tests = """\
    (A/BC)
    (ABC/DE)
    (cycA/B)
    (fABC/BCD)
    (rABC/nE/pABCD)
    (rABC/nE/pABCD)
    (swA/B)
    (pmsABC/D)
    (msABC/D)
    (lmemABC/D)
    (gmemABC/D)
    (crsABC/D)
    (abEAB/eF/EABF/EF)
    (iABCD)
    (oABD)
    (ABC/D)
    (rAB/pABD)
"""
    
# Several process groups in a row, with explicit connectors and recycle digits.
sequence_tests = """\
    (iA)(rAB/pABCD)
    (iA)(oB)(iC)
    (iA)<(oB)>2
    (A/BC)2(oD)
    (A/BC)1
"""

# Bracketed branches, including a branch nested inside another branch.
branch_tests = """\
    (iA)[(oA)]
    (iA)[<(oD)]
    (iA)[<(A/BD)]
    (iA)[(A/BD)(BD/B)[(oA)]]
"""

# Full expressions mixing sequences, recycles and branches.
# NOTE(review): the first two expressions are repeated verbatim as the fifth
# and sixth entries - confirm whether the repetition is intentional.
sfiles_tests = """\
    (iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iB)(rAB/pABCD)<1<2[<(iA)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iA)(rAB/pABCD)[(iB)]    
    (iA)(rAB/pABCD)[<(iB)]
    (iA)(rAB/pABCD)<1<2[<(iB)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iB)(rAB/pABCD)<1<2[<(iA)](mABC/D)[<(oD)](A/BC)1(cycB/C)2(oC)
    (iA)(rAB/pABCD)[<(iB)](mABC/D)[(oD)](A/BC)[(oA)](oBC)
    (iA)[(oB)(oC)[(oD)]]
    (iABCDE)(AB/CDE)[(A/B)[(oA)](oB)]
"""

def sfiles_test(tests, parser=None, parse_all=False):
    """Parse every whitespace-separated test string and print the outcome.

    Parameters
    ----------
    tests : str
        Test expressions separated by any whitespace (spaces or newlines).
    parser : object, optional
        Any object exposing ``parseString(text, parseAll=...)`` whose result
        has ``asList()``. Defaults to the module-level ``sfiles`` grammar,
        looked up at call time.
    parse_all : bool, optional
        Forwarded as ``parseAll``; when true the whole string must match.
        The default (False) keeps the original prefix-matching behaviour.

    For each test string the parsed list is printed on success. On failure
    the string is printed together with the error message instead of being
    silently reported as a bare 'Failed'.
    """
    if parser is None:
        parser = sfiles
    for test in tests.split():
        try:
            result = parser.parseString(test, parseAll=parse_all)
        except Exception as err:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still stop the notebook, and the reason is shown to the reader.
            print('\n', test, 'Failed:', err)
        else:
            print('\n', test, '\n', result.asList())

# Run every collection of test strings through the parser, in the same order
# as they are defined.
for suite in (processgroup_tests, sequence_tests, branch_tests, sfiles_tests):
    sfiles_test(suite)


 (A/BC) 
 [[[['dist', ['A']], ['s', ['B', 'C']]]]]

 (ABC/DE) 
 [[[['dist', ['A', 'B', 'C']], ['s', ['D', 'E']]]]]

 (cycA/B) 
 [[[['cyc', ['A']], ['s', ['B']]]]]

 (fABC/BCD) 
 [[[['f', ['A', 'B', 'C']], ['s', ['B', 'C', 'D']]]]]

 (rABC/nE/pABCD) 
 [[[['r', ['A', 'B', 'C']], ['n', ['E']], ['p', ['A', 'B', 'C', 'D']]]]]

 (rABC/nE/pABCD) 
 [[[['r', ['A', 'B', 'C']], ['n', ['E']], ['p', ['A', 'B', 'C', 'D']]]]]

 (swA/B) 
 [[[['sw', ['A']], ['s', ['B']]]]]

 (pmsABC/D) 
 [[[['pms', ['A', 'B', 'C']], ['s', ['D']]]]]

 (msABC/D) 
 [[[['ms', ['A', 'B', 'C']], ['s', ['D']]]]]

 (lmemABC/D) 
 [[[['lmem', ['A', 'B', 'C']], ['s', ['D']]]]]

 (gmemABC/D) 
 [[[['gmem', ['A', 'B', 'C']], ['s', ['D']]]]]

 (crsABC/D) 
 [[[['crs', ['A', 'B', 'C']], ['s', ['D']]]]]

 (abEAB/eF/EABF/EF) 
 [[[['ab', ['E', 'A', 'B']], ['e', ['F']], ['s', ['E', 'A', 'B', 'F']], ['s', ['E', 'F']]]]]

 (iABCD) 
 [[[['i', ['A', 'B', 'C', 'D']]]]]

 (oABD) 
 [[[['o', ['A', 'B', 'D']]]]]

 (ABC/D) 
 [[[['dist', ['A', 'B', 

## Flowsheets

A simplified version of SFILES is sometimes useful for describing the layout of flowsheets without the detail of describing process groups. 


In [None]:
from pyparsing import Literal, Word, Group
from pyparsing import Optional, OneOrMore, ZeroOrMore, oneOf
from pyparsing import alphas, nums
from pyparsing import nestedExpr

# NOTE(review): this cell rebinds processgroup, connector, recycle, sequence,
# branch and sfiles, replacing the full SFILES grammar defined earlier in the
# notebook for any cell executed afterwards.

# process group: reduced to a single upper case letter
processgroup = Word(alphas.upper(), exact=1)
           
# process sequence is comprised of connectors, process group, and recycles                                             
# The default is the plain string '>' so that an implied connector yields the
# same token as an explicit one; passing a Literal object here would insert
# the parser element itself into the results.
connector = Optional(Literal('>') | Literal('<'), default='>')
recycle = Word(nums, exact=1)
sequence = OneOrMore(connector + (processgroup | recycle ))

# nested branches: bracketed sub-sequences, which may nest
branch = nestedExpr(opener="[", closer="]", content=sequence)
branch.setName('branch')

# sfiles expression
sfiles = sequence + ZeroOrMore(branch | sequence)

In [37]:
# Example: a simplified flowsheet string with recycles and bracketed branches.
result = sfiles.parseString(
    'OA<1E[P]F[Q]G<2MN[R]J<3K[IH3]L[S]D2CB1'
)
result

([">", 'O', ">", 'A', '<', '1', ">", 'E', ([">", 'P'], {}), ">", 'F', ([">", 'Q'], {}), ">", 'G', '<', '2', ">", 'M', ">", 'N', ([">", 'R'], {}), ">", 'J', '<', '3', ">", 'K', ([">", 'I', ">", 'H', ">", '3'], {}), ">", 'L', ([">", 'S'], {}), ">", 'D', ">", '2', ">", 'C', ">", 'B', ">", '1'], {})

In [None]:
fo