In [10]:
import os
import numpy as np
import pandas as pd
import networkx as nx

from tuw_nlp.text.pipeline import CachedStanzaPipeline, CustomStanzaPipeline
from tuw_nlp.graph.utils import GraphMatcher
from tuw_nlp.graph.utils import (
    get_root_id,
    graph_to_isi,
    sen_to_graph
)
from stanza.models.common.doc import Document


In [11]:
config = {
    "data": [
        {"sens": [
            {"text": "This my favorite sentence."},
            {"text": "Yesterday, I had noodles."},
            {"text": "brown dog"}]}],
    "cache_dir": ".cache",
    "memory": "8G",
    "ALTO_JAR": os.path.expanduser("~/tuw_nlp_resources/alto-2.3.6-SNAPSHOT-all.jar")
    "grammar_fn"
    "input_int": "ud",
    "output_int": "fd",
    "output_codec": "amr-sgraph-src",
    "output_fn"
}

sections = config["data"]

In [12]:
nlp_pipeline = CustomStanzaPipeline(
        processors='tokenize,mwt,pos,lemma,depparse')
nlp_cache = os.path.join(config["cache_dir"], 'nlp_cache.json')

result = []

with CachedStanzaPipeline(nlp_pipeline, nlp_cache) as nlp:
    for section in sections:
        for sen in section["sens"]:
            if "tokens" not in sen:
                sen["tokens"] = nlp(sen["text"]).sentences[0].to_dict()
                
sections

2021-11-08 11:02:04 INFO: Loading these models for language: de (German):
| Processor  | Package |
------------------------
| tokenize   | gsd     |
| fix_ssplit | default |

INFO:stanza:Loading these models for language: de (German):
| Processor  | Package |
------------------------
| tokenize   | gsd     |
| fix_ssplit | default |

2021-11-08 11:02:04 INFO: Use device: gpu
INFO:stanza:Use device: gpu
2021-11-08 11:02:04 INFO: Loading: tokenize
INFO:stanza:Loading: tokenize
2021-11-08 11:02:04 INFO: Loading: fix_ssplit
INFO:stanza:Loading: fix_ssplit
2021-11-08 11:02:04 INFO: Done loading processors!
INFO:stanza:Done loading processors!
2021-11-08 11:02:04 INFO: Loading these models for language: de (German):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

INFO:stanza:Loading these models for language: de (German):
| Processor | Package |
-----------------------
| t

[{'sens': [{'text': 'This my favorite sentence.',
    'tokens': [{'id': 1,
      'text': 'This',
      'lemma': 'This',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 0,
      'deprel': 'root',
      'misc': 'start_char=0|end_char=4'},
     {'id': 2,
      'text': 'my',
      'lemma': 'my',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 4,
      'deprel': 'nsubj',
      'misc': 'start_char=5|end_char=7'},
     {'id': 3,
      'text': 'favorite',
      'lemma': 'favorite',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 4,
      'deprel': 'nsubj',
      'misc': 'start_char=8|end_char=16'},
     {'id': 4,
      'text': 'sentence',
      'lemma': 'sentence',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 1,
      'deprel': 'flat',
      'misc': 'start_char=17|end_char=25'},
     {'id': 5,
      'text': '.',
      'lemma': '.',
     

In [13]:

for section in sections:
    for sen in section["sens"]:
        parsed_sen = Document([sen["tokens"]]).sentences[0]
        graph = sen_to_graph(parsed_sen)
        isi = graph_to_isi(graph)
        sen["isi"] = isi
        sen["graph"] = graph
        
sections

[{'sens': [{'text': 'This my favorite sentence.',
    'tokens': [{'id': (1,),
      'text': 'This',
      'lemma': 'This',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 0,
      'deprel': 'root',
      'misc': 'start_char=0|end_char=4'},
     {'id': (2,),
      'text': 'my',
      'lemma': 'my',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 4,
      'deprel': 'nsubj',
      'misc': 'start_char=5|end_char=7'},
     {'id': (3,),
      'text': 'favorite',
      'lemma': 'favorite',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 4,
      'deprel': 'nsubj',
      'misc': 'start_char=8|end_char=16'},
     {'id': (4,),
      'text': 'sentence',
      'lemma': 'sentence',
      'upos': 'PROPN',
      'xpos': 'FM',
      'feats': 'Foreign=Yes',
      'head': 1,
      'deprel': 'flat',
      'misc': 'start_char=17|end_char=25'},
     {'id': (5,),
      'text': '.',
      'lem

In [17]:
"""command = ['java', f'-Xmx{config.memory}',
           '-cp', config.ALTO_JAR, 'de.up.ling.irtg.script.ParsingEvaluator',
           input_fn,
           '-g', grammar_fn,
           '-I', input_int,
           '-O', f"{config.output_int}={config.output_codec}",
           '-o', output_fn]
           """

SyntaxError: EOF while scanning triple-quoted string literal (3239587295.py, line 7)

In [18]:
from tuw_nlp.grammar.text_to_4lang import TextTo4lang

tfl = TextTo4lang("en", "en_nlp_cache")

fl_graphs = list(tfl("brown dog", depth=1, substitute=False))

# Then the fl_graphs will directly contain a networkx graph object
fl_graphs[0].nodes(data=True)

2021-11-08 11:36:53 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

INFO:stanza:Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

2021-11-08 11:36:53 INFO: Use device: gpu
INFO:stanza:Use device: gpu
2021-11-08 11:36:53 INFO: Loading: tokenize
INFO:stanza:Loading: tokenize
2021-11-08 11:36:53 INFO: Loading: pos
INFO:stanza:Loading: pos
2021-11-08 11:36:54 INFO: Loading: lemma
INFO:stanza:Loading: lemma
2021-11-08 11:36:54 INFO: Loading: depparse
INFO:stanza:Loading: depparse
2021-11-08 11:36:54 INFO: Done loading processors!
INFO:stanza:Done loading processors!
Processing tmp/20211108_113706_250090/input.txt (2 instances) ...
1 [ROOT(NOUN(_AMOD(ADJ(brown)),NOUN(dog)))           ] 35 ms
2 [dummy(

NodeDataView({0: {'name': 'dog', 'expanded': True}, 1: {'name': 'brown', 'expanded': True}, 2: {'name': 'mammal'}, 3: {'name': 'familiaris'}, 4: {'name': 'domesticate'}, 5: {'name': 'of'}, 6: {'name': 'appearance'}, 7: {'name': 'variable'}, 8: {'name': 'highly'}, 9: {'name': 'due'}, 10: {'name': 'breeding'}, 11: {'name': 'human'}, 12: {'name': 'for'}, 13: {'name': 'thousand'}, 14: {'name': 'year'}, 15: {'name': 'lupus'}, 16: {'name': 'canis'}, 17: {'name': 'colour'}, 18: {'name': 'like'}, 19: {'name': 'that'}, 20: {'name': 'COORD'}, 21: {'name': 'coffee'}, 22: {'name': 'chocolate'}})

In [19]:
fl_graphs[0].edges(data=True)

OutEdgeDataView([(0, 1, {'color': 0}), (0, 2, {'color': 0}), (1, 17, {'color': 0}), (2, 3, {'color': 0}), (3, 4, {'color': 0}), (3, 15, {'color': 0}), (3, 16, {'color': 0}), (4, 3, {'color': 1}), (5, 4, {'color': 1}), (5, 6, {'color': 2}), (5, 13, {'color': 1}), (5, 14, {'color': 2}), (5, 19, {'color': 1}), (5, 20, {'color': 2}), (6, 7, {'color': 0}), (7, 8, {'color': 0}), (9, 6, {'color': 1}), (9, 10, {'color': 2}), (10, 11, {'color': 0}), (12, 4, {'color': 1}), (12, 13, {'color': 2}), (18, 17, {'color': 1}), (18, 19, {'color': 2}), (20, 21, {'color': 0}), (20, 22, {'color': 0})])