In [1]:
#
# Minimal yaml-like language supporting only seven line patterns
#

# [Supported cases]
#   case p0. xyz
#   case p1. xyz:
#   case p2. xyz: abc
#   case p3. -
#   case p4. - xyz
#   case p5. - xyz:
#   case p6. - xyz: abc

def parse_line(line):
    """Classify one indentation-stripped line into one of the seven patterns.

    Returns a tuple ``(pattern, val1, val2)``:
      pattern: int in -1, 0, .., 6 (-1 means the line matches no pattern)
      val1: str or None -- the scalar / key part, when the pattern has one
      val2: str or None -- the value part, for the "key: value" patterns
    """
    ends_with_colon = line.endswith(':')
    # Exactly one ": " separator is what makes a line a "key: value" pair.
    has_one_separator = line.count(': ') == 1

    # Lines that do not start with a dash: p0, p1 or p2.
    if not line.startswith('-'):
        if ends_with_colon:
            return 1, line[:-1], None
        if has_one_separator:
            key, value = line.split(': ')
            return 2, key, value
        return 0, line, None

    # A lone dash opens a list item whose content is on the following lines.
    if line == '-':
        return 3, None, None

    # A dash that is not followed by a space matches no supported pattern.
    if not line.startswith('- '):
        return -1, None, None

    # "- ..." list items: p4, p5 or p6 (a trailing colon wins over ": ").
    if ends_with_colon:
        return 5, line[2:-1], None
    if has_one_separator:
        key, value = line[2:].split(': ')
        return 6, key, value
    return 4, line[2:], None

In [5]:
import types
# Lightweight mutable record used as parser state; parse_rec reads and
# updates its `tmp` and `indent` attributes.
State = types.SimpleNamespace

def separate_indent(line): # str -> (int, str)
    """Return (number of leading whitespace characters, line without them)."""
    stripped = line.lstrip()
    return len(line) - len(stripped), stripped


def separate_lines(lines, indent): # [str], int -> ([str], [str])
    """Split `lines` at the first line indented less than `indent`.

    Returns (leading lines whose indent is >= `indent`, the remaining lines).
    """
    split_at = len(lines)
    for pos, text in enumerate(lines):
        depth, _ = separate_indent(text)
        if depth < indent:
            split_at = pos
            break
    return list(lines[:split_at]), lines[split_at:]


def _parse_nested(lines, tmp, indent, debug):
    """Parse the block nested at `indent` that starts at the head of `lines`.

    Returns (parsed value, lines that follow the nested block).
    Raises AssertionError if the nested block has lines left over after a
    scalar value, instead of silently dropping them.
    """
    inner_lines, rest = separate_lines(lines, indent)
    inner_state = State(tmp=tmp, indent=indent)
    result, leftover = parse_rec(inner_lines, inner_state, debug=debug)
    if leftover:
        raise AssertionError(
            f"[parse_rec] Not supported line after scalar value : \"{leftover[0]}\"")
    return result, rest


def parse_rec(lines, state, debug=False):
    """Parse `lines`, all belonging to one indentation level, into a value.

    @param
      lines: [str]
      state (inout)
        tmp: value built so far (None, dict or list)
        indent: int, the indentation every line at this level must have
      debug: print the intermediate state for every line when True
    @return
      result: the parsed value (str, dict, list or None)
      lines: [str], lines left unconsumed (non-empty only after a scalar)
    @raise
      AssertionError on an unexpected indent, on a line matching none of the
      supported patterns, or on a pattern that does not fit the value built
      so far (e.g. a list item inside a dict).
    """
    # Sibling lines are handled iteratively so that long inputs do not hit
    # the recursion limit; recursion is only used for nested blocks.
    while lines:
        line, *lines = lines
        curr_indent, curr_line = separate_indent(line)
        if debug:
            print(f"[debug:{state.indent}] tmp = {state.tmp}")
            print(f"[debug:{state.indent}] line = {line}")

        if state.indent != curr_indent:
            raise AssertionError(f"[parse_rec] Not supported indent : \"{line}\"")

        #
        # Parse line
        #
        p, val1, val2 = parse_line(curr_line)
        if debug:
            print(f"[debug:{state.indent}] p = {p}")
        message = f"[parse_rec] Not supported line pattern ({p}) : \"{line}\""

        # parse_line reports -1 for lines matching no pattern; reject them
        # rather than skipping the line.
        if p == -1:
            raise AssertionError(message)

        # A scalar is only valid as the sole content of its block.
        if p == 0:
            if state.tmp is not None:
                raise AssertionError(message)
            return val1, lines

        # Patterns 1-2 build a dict, patterns 3-6 build a list; the first
        # line of a block decides which, later lines must agree.
        expected = dict if p in (1, 2) else list
        if state.tmp is None:
            state.tmp = expected()
        elif not isinstance(state.tmp, expected):
            raise AssertionError(message)

        #
        # Update temporary result (state.tmp) based on the pattern
        #
        if p == 1:
            value, lines = _parse_nested(lines, None, state.indent + 2, debug)
            state.tmp[val1] = value
        elif p == 2:
            state.tmp[val1] = val2
        elif p == 3:
            value, lines = _parse_nested(lines, None, state.indent + 2, debug)
            state.tmp.append(value)
        elif p == 4:
            state.tmp.append(val1)
        elif p == 5:
            # "- key:" -- the key's value is nested 4 deeper, and further
            # keys of the same dict item follow at 2 deeper.
            nested, lines = _parse_nested(lines, None, state.indent + 4, debug)
            item, lines = _parse_nested(
                lines, {val1: nested}, state.indent + 2, debug)
            state.tmp.append(item)
        elif p == 6:
            # "- key: value" -- further keys of the same dict item follow
            # at 2 deeper.
            item, lines = _parse_nested(
                lines, {val1: val2}, state.indent + 2, debug)
            state.tmp.append(item)

    return state.tmp, []


def preprocess_lines(lines):
    """Strip comments and trailing whitespace, dropping lines left empty.

    Everything from the first '#' on a line is treated as a comment.
    Returns a new list of the surviving lines, in order.
    """
    cleaned = []
    for raw in lines:
        # Keep only the text before the first '#', then trim the right side.
        text = raw.split('#', 1)[0].rstrip()
        if text:
            cleaned.append(text)
    return cleaned


def parse(text, debug=False):
    """Parse `text` written in the minimal yaml-like language.

    Comments, trailing whitespace and empty lines are removed first, then the
    remaining lines are parsed starting at indentation 0.

    Returns the parsed value (str, dict, list or None).
    Raises AssertionError if the input is not supported or if lines remain
    after the top-level value.
    """
    lines = preprocess_lines(text.splitlines())
    state = State(tmp=None, indent=0)
    result, lines = parse_rec(lines, state, debug=debug)
    if lines:
        raise AssertionError("Not all text was consumed")
    return result

In [3]:
#
# Example input
#

# Nested dicts with "key: value" leaves; contains a whitespace-only line, an
# empty line, and a final key ("params:") with no content under it.
ex00 = """\
camera:
  type: MyCamera
  params:
    camera_loc: (1, 1, 1)
    lookat_loc: (0, 0, 0)
    up_vec: (0, 1, 0)
    
scene:
  type: MyScene
  params:
    file: data/bunny/reconstruction/bun_zipper_res2.ply

integrator:
  type: NormalIntegrator
  params:
"""

# A list using each of the list-item patterns ("- x", "- x: y", "-", "- x:"),
# followed by a key with no content under it.
ex01 = """\
key1:
  key1-1:
    - 1
    - 2: 3
    -
      4: 5
    - 6:
        7: 8
  key1-2:
"""

In [4]:
# Parse both example inputs and pretty-print the results as JSON.
import json
print(":: ex00")
print(json.dumps(parse(ex00, debug=False), indent=2))
print()

print(":: ex01")
print(json.dumps(parse(ex01, debug=False), indent=2))

:: ex00
{
  "camera": {
    "type": "MyCamera",
    "params": {
      "camera_loc": "(1, 1, 1)",
      "lookat_loc": "(0, 0, 0)",
      "up_vec": "(0, 1, 0)"
    }
  },
  "scene": {
    "type": "MyScene",
    "params": {
      "file": "data/bunny/reconstruction/bun_zipper_res2.ply"
    }
  },
  "integrator": {
    "type": "NormalIntegrator",
    "params": null
  }
}

:: ex01
{
  "key1": {
    "key1-1": [
      "1",
      {
        "2": "3"
      },
      {
        "4": "5"
      },
      {
        "6": {
          "7": "8"
        }
      }
    ],
    "key1-2": null
  }
}


In [1]:
def read_string_literal(line):
    """Read a double-quoted string literal from the start of `line`.

    @param line : str, must begin with a double quote
    @return
      result : str, the literal's content with escapes decoded
      num    : int (number of characters consumed from `line`, both quotes
               included)

    Fails an assertion when the input ends or a newline occurs before the
    closing quote, or when a backslash is followed by an unknown character.
    """
    assert line[0] == '"'

    # Characters that may not appear before the closing quote.
    fatal = {'': 'invalid EOI', '\n': 'invalid EOL'}
    # Character after a backslash -> character it stands for.
    escapes = {'n': '\n', '"': '"', '\\': '\\'}

    pieces = []
    pos = 1
    while True:
        ch = line[pos:pos + 1]  # '' once the input is exhausted
        pos += 1
        assert ch not in fatal, fatal[ch]

        # Closing quote: done.
        if ch == '"':
            return ''.join(pieces), pos

        # Ordinary character.
        if ch != '\\':
            pieces.append(ch)
            continue

        # Escape sequence: consume and decode the character after the backslash.
        escaped = line[pos:pos + 1]
        pos += 1
        assert escaped, 'invalid backslash'
        assert escaped in escapes, f"invalid escape character \\{escaped}"
        pieces.append(escapes[escaped])

# Smoke test: the literal contains an escaped quote and ends at the first
# unescaped one; the text after it (" jsdfj") is left unconsumed.
read_string_literal("""\
"ab\\"c" jsdfj
""")

('ab"c', 7)

In [2]:
#
# Tokens
#   Minus, Colon, Key, String, Indent, Dedent, Newline, EOI
#
# Grammar
#   Input = Newline* Expr Newline* EOI
#   Expr = SimpleExpr | CompoundExpr
#   SimpleExpr   = String Newline
#   CompoundExpr = Dict | List
#   Suite = SimpleExpr | (Newline Indent Expr Dedent)
#   Dict = DictItem+
#   List = ListItem+
#   DictItem = Key Suite
#   ListItem = Minus Suite
#
# NOTE:
#   The use of explicit "Indent", "Dedent", "Newline" follows what python does
#   (cf. https://github.com/python/cpython/blob/master/Grammar/Grammar)
#
# TODO:
#   - support line/column information
#

import dataclasses

@dataclasses.dataclass
class Token:
    # One token of the token stream consumed by the parser.

    # Token kind, e.g. 'Minus', 'Key', 'String', 'Indent', 'Dedent',
    # 'Newline' or 'EOI'.
    name : str
    # Payload of the token: the key or string text for the 'Key' and 'String'
    # tokens created in this file, None otherwise.
    info : "str | None"  = None
    # Line number assigned by the tokenizer (None until it is filled in).
    line_num : "int | None" = None
    # Column offset assigned by the tokenizer (None until it is filled in).
    column_num : "int | None" = None


def handle_quoted_string(s, m):
    """Token action for a double-quoted string at the start of `s`.

    `m` (the regex match) is part of the action signature but is not used.
    Returns (text after the literal, 'String' token carrying its content).
    """
    literal, consumed = read_string_literal(s)
    remainder = s[consumed:]
    return remainder, Token(name='String', info=literal)


# Token rules, tried in order; the first regex matching at the start of the
# remaining text wins.
#   Regex, Action
#   where Action: (str, match) -> (str, Optional[Token])
#     returns the text left after the token and the token itself
#     (None when nothing should be emitted, as for comments).
# The regexes are raw strings so that `\S` is not an invalid string escape.
yaml_token_rule = [
    # Comment: drop the rest of the line.
    [r'#',                               lambda s, m: ('',    None)],
    # Dash not followed by a non-space character: list item marker.
    [r'-(?!\S)',                         lambda s, m: (s[1:], Token('Minus'))],
    # Opening double quote: quoted string literal.
    [r'"',                               handle_quoted_string],
    # Identifier-like key followed by ':' and then whitespace or end of text.
    # Underscores are accepted anywhere in the key (e.g. "camera_loc:").
    [r'([_a-zA-Z][_a-zA-Z0-9]*):(?!\S)', lambda s, m: (s[m.end():], Token('Key', m.group(1)))],
    # Anything else up to the next whitespace: plain string.
    [r'\S*',                             lambda s, m: (s[m.end():], Token('String', s[:m.end()]))],
]


def run_rule(s, rule): # str, List[MatchAction] -> str, Optional[Token]
    """Apply the first rule whose regex matches at the start of `s`.

    `rule` is a sequence of (regex, action) pairs; `action(s, match)` must
    return a pair (remaining text, token or None).

    Returns that pair as a tuple.
    Raises AssertionError when no regex matches. It is raised explicitly so
    the failure is still reported when Python runs with -O.
    """
    import re
    for regex, action in rule:
        m = re.match(regex, s)
        if not m:
            continue
        s, token = action(s, m)
        return s, token
    raise AssertionError(f"No matching token found: \"{s}\"")


def tokenize_content(text, rule):
    """Tokenize the content of one line (leading whitespace already removed).

    Each emitted token gets `column_num` set to its offset within `text`;
    whitespace between tokens is skipped. Returns the list of tokens.
    """
    found = []
    offset = 0
    while text:
        length_before = len(text)
        rest, tok = run_rule(text, rule)
        if tok:
            tok.column_num = offset
            found.append(tok)
        # Drop the whitespace separating this token from the next one and
        # advance the offset by everything that was consumed.
        text = rest.lstrip()
        offset += length_before - len(text)
    return found


def tokenize(text, rule, yaml_hack=True):
    """Generate the tokens of `text`, with explicit layout tokens.

    Besides the content tokens produced by `rule`, the stream contains
    'Newline' after every logical line, 'Indent'/'Dedent' when the
    indentation changes, and a final 'EOI'. Empty and comment-only lines
    produce no tokens.

    A physical line ending in a backslash is joined with the following line
    (backslash and line break removed); the resulting logical line carries
    the line number of its first physical line.

    With `yaml_hack`, a line starting with "- " is tokenized as if the rest of
    the line were on its own line indented by two, which supports e.g.
    "- x: y" followed by further keys aligned under `x`.

    Raises AssertionError when a dedent does not return to an enclosing
    indentation level.
    """
    import io
    inp = io.StringIO(text)
    indent_stack = [0]
    line_num = 0
    physical_line_num = 0
    while True:
        line = inp.readline()
        physical_line_num += 1
        line_num = physical_line_num
        if line == '':
            break

        # Continue reading the "logical" line (backslash). readline() keeps
        # the trailing newline, so the backslash is looked for in front of
        # it; stopping at end of input keeps a dangling backslash on the last
        # line from looping forever.
        while line.rstrip('\r\n').endswith('\\'):
            continuation = inp.readline()
            if continuation == '':
                break
            physical_line_num += 1
            line = line.rstrip('\r\n')[:-1] + continuation

        # ignore empty line
        line_lstrip = line.lstrip()
        if line_lstrip == '':
            continue

        content_tokens = tokenize_content(line_lstrip, rule)

        # ignore empty line due to comment
        if len(content_tokens) == 0:
            continue

        indent = len(line) - len(line_lstrip)

        # push indent stack
        if indent_stack[-1] < indent:
            indent_stack.append(indent)
            yield Token('Indent')

        # pop indent stack
        if indent < indent_stack[-1]:
            while indent < indent_stack[-1]:
                indent_stack.pop()
                yield Token('Dedent')
            if indent != indent_stack[-1]:
                raise AssertionError(
                    f"No matching indentation on dedent: \"{line_lstrip}\"")

        # fill in line/column number (columns were relative to the content)
        for token in content_tokens:
            token.line_num = line_num
            token.column_num += indent

        if yaml_hack:
            # fake line and indent when "- " in order to support e.g. "- x: y"
            if content_tokens[0].name == 'Minus':
                indent_stack.append(indent + 2)
                yield content_tokens[0]
                for name in ['Newline', 'Indent']:
                    yield Token(name, line_num=line_num)
                content_tokens = content_tokens[1:]
                # a bare "-": its content starts on the next line
                if len(content_tokens) == 0:
                    continue

        yield from iter(content_tokens)
        yield Token('Newline', line_num=line_num)

    # flush indent stack
    while 0 < indent_stack[-1]:
        indent_stack.pop()
        yield Token('Dedent', line_num=line_num)

    yield Token('EOI', line_num=line_num)

# Key whose value is an indented "key: value" pair.
ex03 = """
a:
  b: c
"""

# List whose second item is a "key: value" pair on the same line as the dash.
ex04 = """
- a
- b: c
"""

# Run the tokenizer on both examples; list() forces the generator to run to
# the end. The results are discarded.
list(tokenize(ex03, yaml_token_rule, yaml_hack=True))
list(tokenize(ex04, yaml_token_rule, yaml_hack=True))
# Presumably here so that the notebook cell shows no output.
None

In [3]:
class LookaheadIterator:
    """Wrap an iterator so upcoming elements can be inspected before use.

    Once the wrapped iterator is exhausted, `eoi` is returned in place of
    every further element instead of raising StopIteration.
    """

    def __init__(self, orig_iter, eoi):
        self.orig_iter = orig_iter
        self.eoi = eoi
        # Elements already pulled from `orig_iter` but not yet consumed.
        self.buffer = []

    def next_or_eoi(self):
        """Pull the next element from the wrapped iterator, or `eoi`."""
        return next(self.orig_iter, self.eoi)

    def lookahead(self, n):
        """Return the next `n` elements as a list without consuming them."""
        for _ in range(n - len(self.buffer)):
            self.buffer.append(self.next_or_eoi())
        return self.buffer[:n]

    def __next__(self):
        """Consume and return the next element (buffered ones first)."""
        if self.buffer:
            return self.buffer.pop(0)
        return self.next_or_eoi()


class Parser:
    """Recursive-descent parser over the token stream.

    Grammar implemented by the p_* methods:
      Input    = Newline* Expr Newline* EOI
      Expr     = SimpleExpr | Dict | List
      SimpleExpr = String Newline
      Suite    = SimpleExpr | (Newline Indent Expr Dedent)
      Dict     = DictItem+      DictItem = Key Suite
      List     = ListItem+      ListItem = Minus Suite

    Syntax errors are reported as AssertionError. They are raised explicitly
    (not with `assert`) so they are still reported when Python runs with -O.
    """

    def __init__(self, token_gen, debug=False):
        # One token of lookahead is all the grammar needs; once the stream is
        # exhausted an EOI token is returned.
        self.gen = LookaheadIterator(token_gen, Token('EOI'))
        self.debug = debug

    def run(self):
        # Placeholder that does nothing; parsing is started with p_input().
        pass

    # consume
    def c(self, name):
        """Consume the next token, which must be of kind `name`; return its info."""
        tok, = self.gen.lookahead(1)
        if tok.name != name:
            raise AssertionError(f"Expected <{name}>, but found <{tok}>")
        next(self.gen)
        return tok.info

    # match
    def m(self, name):
        """Return True if the next token is of kind `name` (nothing is consumed)."""
        tok, = self.gen.lookahead(1)
        return tok.name == name

    def _skip_newlines(self):
        """Consume any run of Newline tokens."""
        while self.m('Newline'):
            self.c('Newline')

    def p_input(self):
        """Parse a whole input and return its value (str, dict or list)."""
        self._skip_newlines()
        expr = self.p_expr()
        if self.debug:
            print(f"[debug:p_input] expr = {expr}")
        self._skip_newlines()
        self.c('EOI')
        return expr

    def p_expr(self):
        """Parse an Expr, choosing the alternative from the next token."""
        if self.m('String'):
            return self.p_simple_expr()
        if self.m('Key'):
            return self.p_dict()
        if self.m('Minus'):
            return self.p_list()
        raise AssertionError(
            f"Expected [Expr], but found <{self.gen.lookahead(1)[0]}>")

    def p_simple_expr(self):
        """Parse `String Newline` and return the string."""
        value = self.c('String')
        self.c('Newline')
        return value

    def p_suite(self):
        """Parse the value of a dict or list item: inline string or indented block."""
        if self.m('String'):
            return self.p_simple_expr()
        if self.m('Newline'):
            self.c('Newline')
            self.c('Indent')
            expr = self.p_expr()
            self.c('Dedent')
            return expr
        raise AssertionError(
            f"Expected [Suite], but found <{self.gen.lookahead(1)[0]}>")

    def p_dict(self):
        """Parse one or more DictItems into a dict (a repeated key keeps its last value)."""
        items = [self.p_dict_item()]
        while self.m('Key'):
            items.append(self.p_dict_item())
        return dict(items)

    def p_dict_item(self):
        """Parse `Key Suite` and return the (key, value) pair."""
        key = self.c('Key')
        value = self.p_suite()
        return (key, value)

    def p_list(self):
        """Parse one or more ListItems into a list."""
        ls = [self.p_list_item()]
        while self.m('Minus'):
            ls.append(self.p_list_item())
        return ls

    def p_list_item(self):
        """Parse `Minus Suite` and return the item's value."""
        self.c('Minus')
        return self.p_suite()


def parse(text, debug=False, yaml_hack_tokenizer=True):
    """Tokenize and parse `text`, returning the parsed value.

    With `debug`, the complete token list is printed first; the text is then
    tokenized a second time for the actual parse. `yaml_hack_tokenizer` is
    passed to the tokenizer as its `yaml_hack` argument.
    """
    if debug:
        dumped = list(tokenize(text, yaml_token_rule, yaml_hack_tokenizer))
        print("[debug:parse] tokens = ", *dumped, sep='\n')
    tokens = tokenize(text, yaml_token_rule, yaml_hack_tokenizer)
    return Parser(tokens, debug=debug).p_input()


#
# Small tests
#

# Two top-level "key: value" pairs.
ex00 = """
k1: v1
k2: v2
"""

# List of two plain strings.
ex01 = """
- a
- b
"""

# Key whose value is an indented list.
ex02 = """
a:
  - b
"""

# Key whose value is an indented dict.
ex03 = """
a:
  b: c
"""


# Single-item list.
ex04 = """
- a
"""

# Key whose value is a plain string on its own indented line.
ex05 = """
a:
  b
"""

# Bare "-" whose content is an indented nested list.
ex06 = """
-
  - v1
"""

# Nested dict followed by a dedent back to a second top-level key.
ex07 = """
a:
  b: c
d: e
"""

# A single whitespace-free word containing '-', ':' and a backslash.
ex08 = """
a-b:c/d\e
"""

# Double-quoted string containing an escaped newline and the characters
# '-', ':' and '#', which are special outside quotes.
ex09 = """
"abc \\n \'x - : # y\' "
"""

# Indented comment-only line between two "key: value" pairs.
ex10 = """
a: b
  # xyz
c: d
"""

# Invalid input: a key at the same indentation as the items of a nested list.
ex11 = """\
- 
  - a
  c: d
"""


# "- key: value" followed by another key aligned under the first one.
yaml_hack_ex00 = """
- a: b
  c: d
"""


# Each of these inputs must parse without raising; the results are discarded.
parse(ex00, debug=False)
parse(ex01, debug=False)
parse(ex02, debug=False)
parse(ex03, debug=False)
parse(ex04, debug=False)
parse(ex05, debug=False)
parse(ex06, debug=False)
parse(ex07, debug=False)
parse(ex08, debug=False)
parse(ex09, debug=False)
parse(yaml_hack_ex00, debug=False)

def check_throw(func, error_str):
    """Check that calling `func()` raises an error whose message is `error_str`.

    Raises AssertionError if `func` returns normally or if the message of the
    raised exception differs from `error_str`. Only `Exception` subclasses
    are intercepted, so KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        func()
    except Exception as error:
        actual = str(error)
    else:
        raise AssertionError("check_throw failed")
    if actual != error_str:
        raise AssertionError(
            f"expected error message {error_str!r}, got {actual!r}")

# Without the tokenizer hack, "- a: b" puts a Key token directly after the
# Minus token, which is not a valid Suite.
check_throw(
    lambda: parse(yaml_hack_ex00, debug=False, yaml_hack_tokenizer=False),
    "Expected [Suite], but found <Token(name='Key', info='a', line_num=2, column_num=2)>")

# In ex11 a key appears where the nested list has to be closed by a Dedent.
check_throw(
    lambda: parse(ex11),
    "Expected <Dedent>, but found <Token(name='Key', info='c', line_num=3, column_num=2)>")