In [1]:
import dataclasses

import markdown as md
import spacy
from lxml import etree

import anoi
from anoi import loaders, wordnet as wn

In [2]:
# Obtain the WordNet namespace from wn.main() and keep a handle on its
# `space` attribute; the article namespace created below is built on this
# same space.
wordnet_namespace = wn.main()
space = wordnet_namespace.space

In [3]:
# Create a namespace named 'articles' over the same space as the WordNet
# namespace, then construct a Markdown loader around that namespace.
article_namespace = anoi.basis.ANOINamespace(space, 'articles')
mdl = loaders.ANOIMarkdownLoader(article_namespace)

In [5]:
# Read the project README. The encoding is given explicitly so the text is
# decoded the same way on every platform instead of following the locale
# default. Only the read happens inside the `with`; the file handle is
# released before the Markdown is parsed.
with open('../README.md', encoding='utf-8') as fp:
    readme_md = fp.read()

# Parse the Markdown source into the loader's article structure.
readme_struct = mdl.load(readme_md)
# The loader's `html` attribute is read right after load();
# presumably it holds the HTML tree built for this document - TODO confirm.
readme_elem = mdl.html

In [6]:
# Load the spaCy pipeline named 'en_core_web_trf'; `nlp` is applied to the
# README text further down in the notebook.
nlp = spacy.load('en_core_web_trf')

In [7]:
# Flatten the parsed README structure into nested tuples so the hierarchy can
# be inspected in the cell output (the recorded output shows entries of the
# form: level, element, children).
# `dataclasses` is imported in the notebook's top import cell.
dataclasses.astuple(readme_struct)

(0,
 <Element html at 0x290d0cfc0>,
 ((0, <Element body at 0x2b6d816c0>, ()),
  (1,
   <Element h1 at 0x10390c300>,
   ((1, <Element p at 0x2a0e52b00>, ()),
    (2, <Element h2 at 0x2a0f47d00>, ((2, <Element p at 0x2a0e01f80>, ()),)),
    (2,
     <Element h2 at 0x2a0e75a00>,
     ((3,
       <Element h3 at 0x1037c4fc0>,
       ((3, <Element p at 0x2a10cf040>, ()),
        (4,
         <Element h4 at 0x2a0e09780>,
         ((4, <Element ul at 0x2a10b1d80>, ()),)))),
      (3,
       <Element h3 at 0x2a107a500>,
       ((3, <Element p at 0x2a10ad3c0>, ()),
        (3, <Element p at 0x2a10ef040>, ()))),
      (3,
       <Element h3 at 0x2a0b90900>,
       ((3, <Element p at 0x2a0b904c0>, ()),
        (3, <Element p at 0x2a0f31e40>, ()))),
      (3,
       <Element h3 at 0x2a0b94100>,
       ((3, <Element p at 0x11e8dd900>, ()),
        (4,
         <Element h4 at 0x11e8dd7c0>,
         ((4, <Element ul at 0x2a0e645c0>, ()),)))))),
    (2, <Element h2 at 0x2a0f2cec0>, ((2, <Element p at 0

In [12]:
# Per-article lookup tables (node -> parent, node -> timestamp) and a short
# alias for the article walker, which later cells reuse.
parent_property = loaders.article.make_parent_property(readme_struct)
timestamp_property = loaders.article.make_timestamp_property(readme_struct)
walk_article = loaders.article.walk_article

# Report every node that has children: a separator rule, the node's own
# level and text, its parent's level and text (or None when the lookup yields
# no parent), and finally its timestamp entry.
for elem in walk_article(readme_struct):
    if len(elem.children) == 0:
        # Childless nodes are not reported.
        continue
    print('_' * 70)
    print(elem.level, elem.elem.text)
    parent = parent_property[elem]
    print(*((parent,) if parent is None else (parent.level, parent.elem.text)))
    print(timestamp_property[elem])


______________________________________________________________________
0 None
None
None
______________________________________________________________________
1 A Network of Ideas
0 None
None
______________________________________________________________________
2 ANOI Web Application
1 A Network of Ideas
None
______________________________________________________________________
2 ANOI Design
1 A Network of Ideas
None
______________________________________________________________________
3 Types
2 ANOI Design
None
______________________________________________________________________
4 Properties
3 Types
None
______________________________________________________________________
3 Strings
2 ANOI Design
None
______________________________________________________________________
3 Media
2 ANOI Design
None
______________________________________________________________________
3 Articles
2 ANOI Design
None
______________________________________________________________________
4 Article Cr

In [10]:
from spacy import displacy

In [13]:
# Build one plain-text string for spaCy: one line per article node that has
# text, with newlines inside a node's text flattened to spaces.
# NOTE(review): only `.text` is read; for lxml elements that is the text
# before the first child, so text inside inline markup may be left out -
# confirm this is intended.
readme_text = '\n'.join(
    node.elem.text.replace('\n', ' ')
    for node in walk_article(readme_struct)
    if node.elem.text is not None
)
# Run the loaded spaCy pipeline over the assembled text.
readme_doc = nlp(readme_text)



In [14]:
# Size of the parsed document (its length in spaCy tokens).
len(readme_doc)

1323

In [15]:
# Grab the first four tokens of the document; the individual names and the
# `toks` tuple are both bound, to the same token objects.
tok0, tok1, tok2, tok3 = toks = tuple(readme_doc[position] for position in range(4))
# Show each token's surface text next to its lemma.
[(item.text, item.lemma_) for item in toks]

[('A', 'a'), ('Network', 'Network'), ('of', 'of'), ('Ideas', 'Ideas')]

In [16]:
# Materialise the sentence iterator once so sentences can be counted and
# indexed, then show the sentence count together with the first sentence.
readme_sents = [*readme_doc.sents]
(len(readme_sents), readme_sents[0])

(69, A Network of Ideas)

In [17]:
# Take the first sentence of the README and draw it with displaCy using the
# 'dep' style.
sent = readme_sents[0]
displacy.render(sent, style='dep')

In [18]:
# Dump per-token annotations for the first sentence, one token per line with
# a blank line after each: position, text, lemma, coarse and fine POS tags,
# dependency label, shape, and the is_alpha / is_stop flags.
for index, token in enumerate(readme_sents[0]):
    print(
        index,
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        end='\n\n',  # newline for the record plus the blank separator line
    )

0 A a DET DT det X True True

1 Network Network PROPN NNP ROOT Xxxxx True False

2 of of ADP IN prep xx True True

3 Ideas Ideas PROPN NNPS pobj Xxxxx True False

