In [2]:
from typing import Iterator, List, Tuple
# import re

from depccg.cat import Category
from depccg.tree import Tree
from depccg.types import Token
from depccg.tools.reader import ReaderResult


combinators = {
    'SSEQ', '>', '<', '>B', '<B1', '<B2', '<B3',
    '<B4', '>Bx1', '>Bx2', '>Bx3',
    'ADNext', 'ADNint', 'ADV0', 'ADV1', 'ADV2'
}

# DEPENDENCY = re.compile(r'{.+?}')


def read_parsedtree(line: str) -> Iterator[ReaderResult]:
    """read the file of the Japanese CCG derivations parsed by depccg.

    Args:
        filename (str): file name string

    Yields:
        Iterator[ReaderResult]: iterator object containing parse results
    """
    i = 1
    line = line.strip()
    tree, tokens = ParsedJaCCGLineReader(line).parse()
    yield ReaderResult(str(i), tokens, tree)


class ParsedJaCCGLineReader(object):
    def __init__(self, line: str) -> None:
        self.line = line
        self.index = 0
        self.word_id = -1
        self.tokens = []

    def next(self, target: str) -> str:
        end = self.line.find(target, self.index)
        result = self.line[self.index:end]
        self.index = end + 1
        return result

    def check(self, text: str, offset: int = 0) -> None:
        if self.line[self.index + offset] != text:
            raise RuntimeError('AutoLineReader.check catches parse error')

    def peek(self) -> str:
        return self.line[self.index]

    def parse(self) -> Tuple[Tree, List[Token]]:
        result = self.next_node()
        return result, self.tokens

    @property
    def next_node(self):
        end = self.line.find(' ', self.index)
        if self.line[self.index + 1:end] in combinators:
            return self.parse_tree
        else:
            return self.parse_leaf

    def parse_leaf(self) -> Tree:
        self.word_id += 1
        self.check('{')
        cat = self.next(' ')[1:]
        # cat = cat[:cat.find('_')]
        # cat = DEPENDENCY.sub('', cat)
        cat = Category.parse(cat)
        surf, base, pos1, pos2 = self.next('}')[:-1].split('/')
        token = Token(surf=surf, base=base, pos1=pos1, pos2=pos2)
        self.tokens.append(token)
        return Tree.make_terminal(surf, cat)

    def parse_tree(self) -> Tree:
        self.check('{')
        op_string = self.next(' ')
        # cat = DEPENDENCY.sub('', self.next(' '))
        cat = self.next(' ')
        cat = Category.parse(cat)
        self.check('{')

        children = []
        while self.peek() != '}':
            children.append(self.next_node())
            if self.peek() == ' ':
                self.next(' ')

        self.next('}')

        if len(children) == 1:
            return Tree.make_unary(cat, children[0], op_string.replace("{", ""), op_string.replace("{", ""))
        else:
            assert len(
                children) == 2, f'failed to parse, invalid number of children: {self.line}'
            left, right = children
            return Tree.make_binary(cat, left, right, op_string.replace("{", ""), op_string.replace("{", ""))


In [3]:
s = r"{< S[mod=nm,form=stem,fin=t] {> S[mod=nm,form=stem,fin=f] {ADV0 S[mod=X1,form=X2,fin=X3]/S[mod=X1,form=X2,fin=X3] {< NP[case=nc,mod=adv,fin=f] {> NP[case=nc,mod=adv,fin=f] {< NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f] {> NP[case=nc,mod=nm,fin=f] {NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f] ９/９/名詞-数/_} {NP[case=nc,mod=nm,fin=f] ５/５/名詞-数/_}} {(NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f])\NP[case=nc,mod=nm,fin=f] ―/―/記号-一般/_}} {< NP[case=nc,mod=adv,fin=f] {> NP[case=nc,mod=nm,fin=f] {NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f] ９/９/名詞-数/_} {NP[case=nc,mod=nm,fin=f] ７/７/名詞-数/_}} {NP[case=nc,mod=adv,fin=f]\NP[case=nc,mod=nm,fin=f] 年/年/名詞-接尾-助数詞/_}}} {NP[case=X1,mod=X2,fin=f]\NP[case=X1,mod=X2,fin=f] 、/、/記号-読点/_}}} {> S[mod=nm,form=stem,fin=f] {ADV0 S[mod=X1,form=X2,fin=X3]/S[mod=X1,form=X2,fin=X3] {< S[mod=adv,form=cont,fin=f] {< S[mod=adv,form=cont,fin=f] {< NP[case=ni,mod=nm,fin=f] {> NP[case=nc,mod=nm,fin=f] {< NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f] {NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f] 中国/中国/名詞-固有名詞-地域-国/_} {(NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f])\(NP[case=X1,mod=X2,fin=f]/NP[case=X1,mod=X2,fin=f]) ・/・/記号-一般/_}} {NP[case=nc,mod=nm,fin=f] 北京大/北京大/名詞-固有名詞-組織/_}} {NP[case=ni,mod=nm,fin=f]\NP[case=nc,mod=nm,fin=f] に/に/助詞-格助詞-一般/_}} {<B1 S[mod=adv,form=cont,fin=f]\NP[case=ni,mod=nm,fin=f] {S[mod=nm,form=stem,fin=f]\NP[case=ni,mod=nm,fin=f] 留学/留学/名詞-サ変接続/_} {S[mod=adv,form=cont,fin=f]\S[mod=nm,form=stem,fin=f] し/し/動詞-自立/連用形-サ変・スル}}} {S[mod=X1,form=X2,fin=f]\S[mod=X1,form=X2,fin=f] 、/、/記号-読点/_}}} {> S[mod=nm,form=stem,fin=f] {< S[mod=X1,form=X2,fin=f]/S[mod=X1,form=X2,fin=f] {< NP[case=nc,mod=nm,fin=f] {NP[case=nc,mod=nm,fin=f] 帰国/帰国/名詞-サ変接続/_} {NP[case=nc,mod=nm,fin=f]\NP[case=nc,mod=nm,fin=f] 後/後/名詞-接尾-副詞可能/_}} {(S[mod=X1,form=X2,fin=f]/S[mod=X1,form=X2,fin=f])\NP[case=nc,mod=nm,fin=f] に/に/助詞-格助詞-一般/_}} {< S[mod=nm,form=stem,fin=f] {< NP[case=o,mod=nm,fin=f] {NP[case=nc,mod=nm,fin=f] 双子/双子/名詞-一般/_} {NP[case=o,mod=nm,fin=f]\NP[case=nc,mod=nm,fin=f] を/を/助詞-格助詞-一般/_}} {S[mod=nm,form=stem,fin=f]\NP[case=o,mod=nm,fin=f] 出産/出産/名詞-サ変接続/_}}}}} {S[mod=nm,form=stem,fin=t]\S[mod=nm,form=stem,fin=f] 。/。/記号-句点/_}}"

In [8]:
trees = [tree for _, _, tree in read_parsedtree(s)]
tree = trees[0]
tree
tree.left_child.left_child.op_symbol

'ADV0'

In [3]:
s = r"{> S[mod=nm,form=base,fin=f] {>T S[mod=X1,form=X2,fin=X3]/(S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3]) {< NP[case=ga,mod=nm,fin=f] {NP[case=nc,mod=nm,fin=f] 太郎/太郎/_/_} {NP[case=ga,mod=nm,fin=f]\NP[case=nc,mod=nm,fin=f] が/が/_/_}}} {> S[mod=nm,form=base,fin=f]\NP[case=ga,mod=nm,fin=f] {>T (S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])/((S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])\NP[case=X1,mod=X2,fin=X3]) {< NP[case=o,mod=nm,fin=f] {NP[case=nc,mod=nm,fin=f] 花子/花子/_/_} {NP[case=o,mod=nm,fin=f]\NP[case=nc,mod=nm,fin=f] を/を/_/_}}} {<B2 (S[mod=nm,form=base,fin=f]\NP[case=ga,mod=nm,fin=f])\NP[case=o,mod=nm,fin=f] {(S[mod=nm,form=cont,fin=f]\NP[case=ga,mod=nm,fin=f])\NP[case=o,mod=nm,fin=f] 殴っ/殴っ/_/_} {S[mod=nm,form=base,fin=f]\S[mod=nm,form=cont,fin=f] た/た/_/_}}}}"
trees = [tree for _, _, tree in read_parsedtree(s)]
tree = trees[0]
tree

IndexError: pop from empty list

In [9]:
from depccg.cat import Category, Functor, Atom, Feature
import re
cat_split = re.compile(r'([\[\]\(\)/\\|<>])')
punctuations = [',', '.', ';', ':', 'LRB', 'RRB', 'conj', '*START*', '*END*']

def parse(text: str) -> 'Category':
        tokens = cat_split.sub(r' \1 ', text)
        print('print(tokens)')
        print(tokens)
        buffer = list(reversed([i for i in tokens.split(' ') if i != '']))
        print('print(buffer)')
        print(buffer)
        print()
        stack = []

        while len(buffer):
            item = buffer.pop()
            if item in punctuations:
                stack.append(Atom(item))
                print('if item in punctuations')
                print(buffer)
                print(stack)
                print()
            elif item in '(<':
                stack.append(item)
                print('elif item in (<')
                print(buffer)
                print(stack)
                print()
            elif item in ')>':
                y = stack.pop()
                # case like: stack = ["(", S/NP], buffer = [")"]
                # which can occur when parsing eg., "((S/NP))"
                print('elif item in >)')
                print(buffer)
                print(stack)
                print()
                assert len(stack) > 0
                if (
                    stack[-1] == '(' and item == ')'
                    or stack[-1] == '<' and item == '>'
                ):
                    assert stack.pop() in "(<"
                    stack.append(y)
                    print('if item in stack[-1] == ( and item == ) or stack[-1] == < and item == >')
                    print(buffer)
                    print(stack)
                    print()
                # case like: stack = ["(", S, /, NP], buffer = [")"]
                else:
                    f = stack.pop()
                    x = stack.pop()
                    assert stack.pop() in "(<"
                    stack.append(Functor(x, f, y))
                    print('else:(if item in stack[-1] == ( and item == ) or stack[-1] == < and item == >)')
                    print(buffer)
                    print(stack)
                    print()
            elif item in '/\\|':
                stack.append(item)
            else:
                # cases to process atomic category
                # 1. when there is a feature eg., buffer = ["[", "dcl", "]"]
                if len(buffer) >= 3 and buffer[-1] == '[':
                    buffer.pop()
                    feature = Feature.parse(buffer.pop())
                    assert buffer.pop() == ']'
                    stack.append(Atom(item, feature))
                    print('else if len(buffer) >= 3')
                    print(buffer)
                    print(stack)
                    print()
                # 2. case with no feature
                else:
                    stack.append(Atom(item))
                    print('else')
                    print('stack.append(Atom(item))')
                    print(buffer)
                    print(stack)
                    print()
        if len(stack) == 1:
                return stack[0]
        try:
                x, f, y = stack
                return Functor(x, f, y)
        except ValueError:
                raise RuntimeError(f'falied to parse category: {text}')

In [11]:
parse(r"S[mod=X1,form=X2,fin=X3]/(S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])")

print(tokens)
S [ mod=X1,form=X2,fin=X3 ]  /  ( S [ mod=X1,form=X2,fin=X3 ]  \ NP [ case=X1,mod=X2,fin=X3 ]  ) 
print(buffer)
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(', '/', ']', 'mod=X1,form=X2,fin=X3', '[', 'S']

else if len(buffer) >= 3
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(', '/']
[S[mod=X1,form=X2,fin=X3]]

elif item in (<
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S']
[S[mod=X1,form=X2,fin=X3], '/', '(']

else if len(buffer) >= 3
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\']
[S[mod=X1,form=X2,fin=X3], '/', '(', S[mod=X1,form=X2,fin=X3]]

else if len(buffer) >= 3
[')']
[S[mod=X1,form=X2,fin=X3], '/', '(', S[mod=X1,form=X2,fin=X3], '\\', NP[case=X1,mod=X2,fin=X3]]

elif item in >)
[]
[S[mod=X1,form=X2,fin=X3], '/', '(', S[mod=X1,form=X2,fin=X3], '\\']

else:(if item in stack[-1] == ( and item == ) or stack[-1] == < and 

S[mod=X1,form=X2,fin=X3]/(S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])

In [12]:
parse(r"(S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])/((S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])\NP[case=X1,mod=X2,fin=X3])")

print(tokens)
 ( S [ mod=X1,form=X2,fin=X3 ]  \ NP [ case=X1,mod=X2,fin=X3 ]  )  /  (  ( S [ mod=X1,form=X2,fin=X3 ]  \ NP [ case=X1,mod=X2,fin=X3 ]  )  \ NP [ case=X1,mod=X2,fin=X3 ]  ) 
print(buffer)
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(', '(', '/', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(']

elif item in (<
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(', '(', '/', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S']
['(']

else if len(buffer) >= 3
[')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\', ']', 'mod=X1,form=X2,fin=X3', '[', 'S', '(', '(', '/', ')', ']', 'case=X1,mod=X2,fin=X3', '[', 'NP', '\\']
['(', S[mod=X1,form=X2,fin=X

(S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])/((S[mod=X1,form=X2,fin=X3]\NP[case=X1,mod=X2,fin=X3])\NP[case=X1,mod=X2,fin=X3])