In [27]:
# Standard-library modules used throughout the notebook.
import sys
import os
import json
# Put the parent of the current working directory on the import path so that
# the `src` package can be resolved from where the notebook is run.
sys.path.append(os.path.abspath(os.getcwd() + '/..'))
import src.features.tree
import importlib
# Re-execute src.features.tree so that changes made to it are picked up without
# restarting the kernel. The `from ... import` below has to run after the
# reload so that the imported names bind to the refreshed definitions.
importlib.reload(src.features.tree)
from src.features.tree import Tree, DataTree, OrAndTree, AndOrTree, DataOnlyTree, dfs, LabeledTree, TemplateTree

In [28]:
# Location of the crafted Japanese part-of-speech restrictions file, resolved
# relative to the notebook's current working directory.
jpn_pos_restrictions_file = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'data', 'crafted',
                 'jpn_pos_restrictions.json'))

# Read the file a single time and decode it explicitly as UTF-8 instead of
# relying on the platform's locale-dependent default encoding.
with open(jpn_pos_restrictions_file, 'r', encoding='utf-8') as f:
    pos_restrictions_json = f.read()

# Display the restrictions as they look after
# TemplateTree.prepare_restrictions has processed the dict.
pos_dict = json.loads(pos_restrictions_json)
TemplateTree.prepare_restrictions(pos_dict)
print(json.dumps(pos_dict, indent='    '))

# Parse the same text again so that the following cells start from a dict that
# has not been passed through prepare_restrictions (the original cell re-read
# the file from disk for this; presumably because the call modifies its
# argument -- TODO confirm).
pos_dict = json.loads(pos_restrictions_json)

{
    "expression": {
        "parents": [],
        "label": null,
        "_children": [
            "whitespace",
            "symbol",
            "word",
            "multiword",
            "syllabary",
            "alphabet"
        ],
        "_depth": 0
    },
    "whitespace": {
        "parents": [
            "expression"
        ],
        "label": "segment",
        "_children": [],
        "_depth": 1
    },
    "symbol": {
        "parents": [
            "expression"
        ],
        "label": "segment",
        "_children": [
            "punctuation"
        ],
        "_depth": 1
    },
    "word": {
        "parents": [
            "expression"
        ],
        "label": "segment",
        "_children": [
            "adverb",
            "noun",
            "adjective",
            "verb",
            "copula",
            "particle",
            "dependent",
            "conjunction",
            "interjection",
            "numeral",
            "prefix",
     

In [29]:
# Pronouns are modeled disjoint from nouns to allow for better inter-language
# compatibility
#  
# See https://japanese.stackexchange.com/a/23578 : temporal noun
# != adverbial noun, temporal noun -> noun, temporal noun -> adverb
 
# XXX Currently, there are no common classes for generic i-adj. (non-'yoi/ii
# class' adjectives); the same goes for some other word classes
# 
# XXX Monitor performance of JUMAN++ adverbial nouns, beshi/gotoshi, 名詞性特殊
# 接尾辞
# 
# XXX Maybe make one special verb class for uru
# XXX Move aux./dep. to higher layers in tree?
# 
# XXX 'wa column' and 'a column' may be the same, with 'ゑ' conjugation being a
# special class in classical Japanese; JUMAN++ classifies 'a column' godan verbs
# as belonging to 'ワ行', not 'ア行'

In [30]:
# Build the lexeme's template tree from a flat list of part-of-speech tags.
# The recorded output shows the tags arranged into a labeled hierarchy
# according to pos_dict.
lexeme_pos_tree = TemplateTree.parse(
    [
        'ra column',
        '-aru special class',
        'monograde',
        'upper class',
        'prefix',
        'noun',
    ],
    pos_dict,
)
print(lexeme_pos_tree)

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ├─╮ [36m'class'[0m
      │ ├╴[33m'verb'[0m
      │ │ ╰─╮ [36m'class'[0m
      │ │   ├╴[33m'quintigrade'[0m
      │ │   │ ├──╴[36m'ending'[0m: [33m'ra column'[0m
      │ │   │ ╰──╴[36m'inflection'[0m: [33m'-aru special class'[0m
      │ │  [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ │   ╰╴[33m'monograde'[0m
      │ │     ╰──╴[36m'class'[0m: [33m'upper class'[0m
      │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ ╰╴[33m'noun'[0m
     [32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
      ╰──╴[36m'affix'[0m: [33m'prefix'[0m


In [31]:
# Hand-built token tree (node data only, no labels) that the lexeme template
# tree is scored against.
token_pos_tree = DataOnlyTree('expression')
word = token_pos_tree.attach('word')

adjective = word.attach('adjective')
for adjective_class in ('no-adjective', 'na-adjective'):
    adjective.attach(adjective_class)

noun = word.attach('noun')
noun.attach('proper noun')

verb = word.attach('verb')
# NOTE(review): 'monograde' is attached once per class, so the tree ends up
# with two sibling 'monograde' nodes (visible in the recorded output) --
# confirm that this duplication is intended.
for monograde_class in ('upper class', 'lower class'):
    verb.attach('monograde').attach(monograde_class)
verb.attach('transitive')

for extra_word_class in ('suru verb', 'suffix'):
    word.attach(extra_word_class)

match_score, match_result = lexeme_pos_tree.score(token_pos_tree)

# Show both inputs, then the score, then the match result if there is one;
# each part is followed by a blank line.
print(lexeme_pos_tree, end='\n\n')
dfs(token_pos_tree)
print()
print(match_score, end='\n\n')
if match_result is not None:
    dfs(match_result)
    print()

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ├─╮ [36m'class'[0m
      │ ├╴[33m'verb'[0m
      │ │ ╰─╮ [36m'class'[0m
      │ │   ├╴[33m'quintigrade'[0m
      │ │   │ ├──╴[36m'ending'[0m: [33m'ra column'[0m
      │ │   │ ╰──╴[36m'inflection'[0m: [33m'-aru special class'[0m
      │ │  [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ │   ╰╴[33m'monograde'[0m
      │ │     ╰──╴[36m'class'[0m: [33m'upper class'[0m
      │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ ╰╴[33m'noun'[0m
     [32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
      ╰──╴[36m'affix'[0m: [33m'prefix'[0m

╶╴[33m'expression'[0m
  ╰╴[33m'word'[0m
    ├╴[33m'adjective'[0m
    │ ├╴[33m'no-adjective'[0m
    │ ╰╴[33m'na-adjective'[0m
    ├╴[33m'noun'[0m
    │ ╰╴[33m'proper noun'[0m
    ├╴[33m'verb'[0m
    │ ├╴[33m'monograde'[0m
    │ │ ╰╴[33m'upper class'[0m
    │ ├╴[33m'monograde'[0m
    │ │ ╰╴[33m'lower class'[0m
    │ ╰╴[33m'transitive'[0m
    ├╴[33m'suru verb'[0m
    ╰╴[33m'suffi

  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(node, prefix + ('\u2502 ' if next_sibling else '  '), i < l - 1)
  dfs(

In [32]:
# Same comparison as before, but with the token tree parsed from tags instead
# of being assembled by hand.
print(lexeme_pos_tree, end='\n\n')
token_pos_tree = TemplateTree.parse(
    ['verb', 'quintigrade', 'monograde', 'upper class'],
    pos_dict,
)
print(token_pos_tree, end='\n\n')
match_score, match_result = lexeme_pos_tree.score(token_pos_tree)
# Score and match result, separated by one blank line.
print(match_score, match_result, sep='\n\n')

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ├─╮ [36m'class'[0m
      │ ├╴[33m'verb'[0m
      │ │ ╰─╮ [36m'class'[0m
      │ │   ├╴[33m'quintigrade'[0m
      │ │   │ ├──╴[36m'ending'[0m: [33m'ra column'[0m
      │ │   │ ╰──╴[36m'inflection'[0m: [33m'-aru special class'[0m
      │ │  [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ │   ╰╴[33m'monograde'[0m
      │ │     ╰──╴[36m'class'[0m: [33m'upper class'[0m
      │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ ╰╴[33m'noun'[0m
     [32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
      ╰──╴[36m'affix'[0m: [33m'prefix'[0m

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ╰──╴[36m'class'[0m: [33m'verb'[0m
          ╰─╮ [36m'class'[0m
            ├╴[33m'quintigrade'[0m
           [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
            ╰╴[33m'monograde'[0m
              ╰──╴[36m'class'[0m: [33m'upper class'[0m

0.6950893220270075

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ╰──╴

In [33]:
# Assemble a labeled tree by hand through item assignment. Judging by the
# recorded output, key paths alternate between a label and node data
# ('segment' -> 'word' -> 'class' -> ...), and assigning several nodes under
# the same label adds siblings instead of replacing the earlier node (e.g.
# 'verb', 'noun' and 'adjective' all appear under 'class').
# NOTE(review): the first assignment's path ends in the node's own data
# ('word') while the others end in a label -- confirm both forms are supported.
tree = LabeledTree('expression')
tree['segment', 'word'] = LabeledTree('word')
tree['segment'] = LabeledTree('multiword')
# Word classes; the chained assignments keep handles to the new nodes so that
# sub-classes can be added relative to them below.
verb = tree['segment', 'word', 'class'] = LabeledTree('verb')
noun = tree['segment', 'word', 'class'] = LabeledTree('noun')
adj = tree['segment', 'word', 'class'] = LabeledTree('adjective')
tree['segment', 'word', 'pronoun'] = LabeledTree('pronoun')
tree['segment', 'word', 'affix'] = LabeledTree('suffix')
tree['segment', 'word', 'affix'] = LabeledTree('prefix')
# Adjective sub-classes, addressed relative to the 'adjective' node.
adj['class'] = LabeledTree('i-adjective')
adj['class', 'i-adjective', 'variation'] = LabeledTree('yoi/ii class')
adj['class'] = LabeledTree('ku-adjective')
adj['class'] = LabeledTree('na-adjective')
# Verb sub-classes, addressed relative to the 'verb' node.
verb['class'] = LabeledTree('quintigrade')
verb['class'] = LabeledTree('monograde')
tree['segment', 'word', 'suru'] = LabeledTree('suru verb')
tree['segment', 'word', 'suru', 'suru verb', 'class'] = LabeledTree('noun')
verb['transitivity'] = LabeledTree('transitive')
verb['transitivity'] = LabeledTree('intransitive')
# Delete 'intransitive' again via its full path from the root; the recorded
# output lists only 'transitive' under 'transitivity'.
del tree['segment', 'word', 'class', 'verb', 'transitivity', 'intransitive']
print(tree)

# Toy grammar of German
deu_pos_restrictions_file = os.path.abspath(
    os.getcwd() + '/../data/crafted/deu_pos_restrictions.json')
tree = TemplateTree('expression', restrictions=deu_pos_restrictions_file)

word = TemplateTree('word')
tree['word'] = word
multiword = TemplateTree('multiword')
tree['multiword'] = multiword

# Both segment kinds receive the same set of word classes, inserted in the
# same order for each.
for structure in (word, multiword):
    structure['verb'] = TemplateTree('verb')
    for conjugation in ('weak', 'strong', 'irregular'):
        structure['verb'].attach(conjugation)
    for transitivity in ('transitive', 'intransitive'):
        structure['verb', transitivity] = TemplateTree(transitivity)

    structure['noun'] = TemplateTree('noun')
    for noun_kind in ('male', 'female', 'neuter', 'proper'):
        structure['noun', noun_kind] = TemplateTree(noun_kind)

    # No 'article' subtree is built in this loop; its restriction entry is
    # still printed at the end of the cell.

    pronoun = TemplateTree('pronoun')
    structure['pronoun'] = pronoun
    # NOTE(review): 'posessive' keeps the original spelling; it presumably has
    # to match an entry in deu_pos_restrictions.json -- confirm before
    # changing it to 'possessive'.
    for pronoun_kind in ('indefinite', 'personal', 'posessive', 'reflexive',
                         'demonstrative', 'interrogative', 'relative'):
        pronoun[pronoun_kind] = TemplateTree(pronoun_kind)

    structure['adjective'] = TemplateTree('adjective')
    for gradability in ('gradable', 'non-gradable'):
        structure['adjective', gradability] = TemplateTree(gradability)

    structure['numeral'] = TemplateTree('numeral')
    for numeral_kind in ('cardinal', 'ordinal'):
        structure['numeral', numeral_kind] = TemplateTree(numeral_kind)

    for plain_class in ('adverb', 'preposition'):
        structure[plain_class] = TemplateTree(plain_class)

    structure['conjunction'] = TemplateTree('conjunction')
    for conjunction_kind in ('subordinate', 'coordinate'):
        structure['conjunction', conjunction_kind] = TemplateTree(conjunction_kind)

    for plain_class in ('interjection', 'prefix', 'suffix'):
        structure[plain_class] = TemplateTree(plain_class)

# Indexing with the empty path yields (label, children) pairs; print each
# label followed by the repr of every child, one item per line.
for label, children in multiword[()]:
    print(label, *map(repr, children), sep='\n')

# Spot checks: membership, data validation, full rendering, a raw restriction
# entry and a sub-tree lookup.
print('word' in tree)
print(tree.is_valid_data('verb'))
print(tree)
print(tree._restrictions['article'])
print(tree['word', 'verb'])

╶╴[33m'expression'[0m
  ╰─╮ [36m'segment'[0m
    ├╴[33m'word'[0m
    │ ├─╮ [36m'class'[0m
    │ │ ├╴[33m'verb'[0m
    │ │ │ ├─╮ [36m'class'[0m
    │ │ │ │ ├╴[33m'quintigrade'[0m
    │ │ │ │ ╰╴[33m'monograde'[0m
    │ │ │[32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
    │ │ │ ╰──╴[36m'transitivity'[0m: [33m'transitive'[0m
    │ │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
    │ │ ├╴[33m'noun'[0m
    │ │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
    │ │ ╰╴[33m'adjective'[0m
    │ │   ╰─╮ [36m'class'[0m
    │ │     ├╴[33m'i-adjective'[0m
    │ │     │ ╰──╴[36m'variation'[0m: [33m'yoi/ii class'[0m
    │ │    [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
    │ │     ├╴[33m'ku-adjective'[0m
    │ │     ╰╴[33m'na-adjective'[0m
    │[32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
    │ ├──╴[36m'pronoun'[0m: [33m'pronoun'[0m
    │[32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
    │ ├─╮ [36m'affix'[0m
    │ │ ├╴[33m'suffix'[0m
    │ │ ╰╴[33m'prefix'[0m
    │[32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
    │ ╰──╴[36m'suru'[0m: [33m'suru verb'[0m
    │     ╰──╴

In [34]:
# Round trip: tree -> dict -> JSON text -> dict -> tree, then render the
# rebuilt tree.
dump = json.dumps(lexeme_pos_tree.to_dict())
restored_pos_tree = TemplateTree.from_dict(
    json.loads(dump), jpn_pos_restrictions_file)
print(restored_pos_tree)

╶╴[33m'expression'[0m
  ╰──╴[36m'segment'[0m: [33m'word'[0m
      ├─╮ [36m'class'[0m
      │ ├╴[33m'verb'[0m
      │ │ ╰─╮ [36m'class'[0m
      │ │   ├╴[33m'quintigrade'[0m
      │ │   │ ├──╴[36m'ending'[0m: [33m'ra column'[0m
      │ │   │ ╰──╴[36m'inflection'[0m: [33m'-aru special class'[0m
      │ │  [31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ │   ╰╴[33m'monograde'[0m
      │ │     ╰──╴[36m'class'[0m: [33m'upper class'[0m
      │[31m╶[0m│[31m╌╌╌╌╌╌╴[0m
      │ ╰╴[33m'noun'[0m
     [32m╶[0m│[32m╴ ╶╌╌╌╌╌╴[0m
      ╰──╴[36m'affix'[0m: [33m'prefix'[0m


In [35]:
# Edge case: parsing an empty tag list. The recorded output shows only the
# root node. Note that this rebinds lexeme_pos_tree, replacing the tree built
# in the earlier cell.
lexeme_pos_tree = TemplateTree.parse(
    [],
    pos_dict,
)
print(lexeme_pos_tree)

╶╴[33m'expression'[0m
