In [1]:
import dataclasses
import os.path
import pickle

import markdown as md
import spacy
from lxml import etree

import anoi
from anoi import loaders, wordnet as wn
from anoi.facade import get_facade

In [2]:
# Load the cached ANOI space when 'space.pkl' exists; otherwise build a fresh
# facade and cache its space for the next run.
# NOTE(security): pickle.load can execute arbitrary code from the file, so
# only use a 'space.pkl' that this notebook itself produced.
if os.path.exists('space.pkl'):
    with open('space.pkl', 'rb') as pkl_fp:
        space = pickle.load(pkl_fp)
    facade = get_facade(space)
    wordnet_namespace = facade.namespace
else:
    facade = get_facade()
    wordnet_namespace = facade.namespace
    # Bind `space` on this path too: later cells use it directly and would
    # otherwise raise NameError on a first run with no cache file.
    space = wordnet_namespace.space
    with open('space.pkl', 'wb') as pkl_fp:
        pickle.dump(space, pkl_fp)

<anoi.basis.ANOIInMemorySpace object at 0x10590a0a0>


In [3]:
# Build a namespace called 'articles' from the shared `space`, then a Markdown
# loader constructed with that namespace.
# NOTE(review): presumably loaded articles are stored under this namespace —
# confirm against ANOIMarkdownLoader.
article_namespace = anoi.basis.ANOINamespace(space, 'articles')
mdl = loaders.ANOIMarkdownLoader(article_namespace)

In [4]:
# Read the project README and parse it with the Markdown loader.
# The encoding is explicit so the text does not depend on the platform's
# default locale encoding.
with open('../README.md', encoding='utf-8') as fp:
    readme_md = fp.read()
# Parsing needs only the text, so it runs after the file has been closed.
readme_struct = mdl.load(readme_md)
# NOTE(review): `mdl.html` is read right after `load`; presumably it holds the
# HTML tree produced by that call — confirm against ANOIMarkdownLoader.
readme_elem = mdl.html

In [5]:
# Load the spaCy pipeline named 'en_core_web_trf'; the package must already be
# installed in the kernel's environment.
nlp = spacy.load('en_core_web_trf')

In [6]:
# Show the parsed article as nested tuples so its structure can be inspected.
# `dataclasses` is imported in the notebook's top import cell.
dataclasses.astuple(readme_struct)

(0,
 <Element html at 0x10593b880>,
 ((0, <Element body at 0x10597e640>, ()),
  (1,
   <Element h1 at 0x10597e580>,
   ((1, <Element p at 0x14ad6a540>, ()),
    (2, <Element h2 at 0x2bb375c40>, ((2, <Element p at 0x2bb375c00>, ()),)),
    (2,
     <Element h2 at 0x2bb3b5300>,
     ((3,
       <Element h3 at 0x2bb3b5380>,
       ((3, <Element p at 0x2bb3b5440>, ()),
        (4,
         <Element h4 at 0x2bb3b54c0>,
         ((4, <Element ul at 0x2bb3b5540>, ()),)))),
      (3,
       <Element h3 at 0x2bb3b5600>,
       ((3, <Element p at 0x10593b300>, ()),
        (3, <Element p at 0x2bb3b56c0>, ()))),
      (3,
       <Element h3 at 0x2bb3b5780>,
       ((3, <Element p at 0x2bb3b5f80>, ()),
        (3, <Element p at 0x2bb3b5f00>, ()))),
      (3,
       <Element h3 at 0x2bb3b5e40>,
       ((3, <Element p at 0x2bb3b5dc0>, ()),
        (4,
         <Element h4 at 0x2bb3b5d40>,
         ((4, <Element ul at 0x2bb3b5cc0>, ()),)))))),
    (2, <Element h2 at 0x2bb3b5b40>, ((2, <Element p at 0

In [7]:
# Build the per-element lookups used below and keep a short alias for the
# article walker.
parent_property = loaders.article.make_parent_property(readme_struct)
timestamp_property = loaders.article.make_timestamp_property(readme_struct)
walk_article = loaders.article.walk_article

# Report every element that has children: its own level and text, then its
# parent's level and text (or None when it has no parent), then its timestamp.
for elem in walk_article(readme_struct):
    if len(elem.children) == 0:
        continue
    print('_' * 70)
    print(elem.level, elem.elem.text)
    parent = parent_property[elem]
    if parent is None:
        print(parent)
    else:
        print(parent.level, parent.elem.text)
    print(timestamp_property[elem])


______________________________________________________________________
0 None
None
None
______________________________________________________________________
1 A Network of Ideas
0 None
None
______________________________________________________________________
2 ANOI Web Application
1 A Network of Ideas
None
______________________________________________________________________
2 ANOI Design
1 A Network of Ideas
None
______________________________________________________________________
3 Types
2 ANOI Design
None
______________________________________________________________________
4 Properties
3 Types
None
______________________________________________________________________
3 Strings
2 ANOI Design
None
______________________________________________________________________
3 Media
2 ANOI Design
None
______________________________________________________________________
3 Articles
2 ANOI Design
None
______________________________________________________________________
4 Article Cr

In [8]:
from spacy import displacy

In [9]:
# Collect the text of every article element that has any, flatten embedded
# newlines to spaces, and join the pieces one per line.
readme_text = '\n'.join(
    node.elem.text.replace('\n', ' ')
    for node in walk_article(readme_struct)
    if node.elem.text is not None
)
# Run the spaCy pipeline over the combined text.
readme_doc = nlp(readme_text)



In [10]:
# Length of the parsed document (for a spaCy Doc this is its token count).
len(readme_doc)

1323

In [11]:
# Take the first four tokens of the document, keep them both individually and
# as a tuple, and show each token's text next to its lemma.
tok0, tok1, tok2, tok3 = (readme_doc[position] for position in range(4))
toks = (tok0, tok1, tok2, tok3)
[(token.text, token.lemma_) for token in toks]

[('A', 'a'), ('Network', 'Network'), ('of', 'of'), ('Ideas', 'Ideas')]

In [12]:
# Materialize the sentence iterator so sentences can be counted and indexed,
# then show the count together with the first sentence.
readme_sents = [*readme_doc.sents]
(len(readme_sents), readme_sents[0])

(69, A Network of Ideas)

In [13]:
# Pick the first sentence and render it with displaCy using the 'dep'
# (dependency parse) style.
sent = readme_sents[0]
displacy.render(sent, style='dep')

In [14]:
# Print the main attributes of every token in the first sentence: one token
# per line, each followed by a blank line.
for index, token in enumerate(readme_sents[0]):
    print(
        index,
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        end='\n\n',
    )

0 A a DET DT det X True True

1 Network Network PROPN NNP ROOT Xxxxx True False

2 of of ADP IN prep xx True True

3 Ideas Ideas PROPN NNPS pobj Xxxxx True False



In [24]:
# Courtesy https://stackoverflow.com/questions/25698448/how-to-embed-html-into-ipython-output
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display


def display_uid(uid=anoi.basis.ANOIReserved.ROOT.value):
    """Show the facade's rendering of a UID as HTML in the notebook output.

    Args:
        uid: UID to render. Defaults to ``anoi.basis.ANOIReserved.ROOT.value``,
            which is evaluated once, when this function is defined.

    The markup is whatever ``facade.render_uid(uid)`` returns; it is wrapped
    in ``HTML`` and passed to ``display``.
    """
    display(HTML(data=facade.render_uid(uid)))

In [25]:
# Called with no argument, so the helper's default UID is rendered.
display_uid()