In [None]:
# Raw sample text with hard line breaks and end-of-line hyphenation (e.g. "sat-" / "isfy");
# it is the input for the normalization steps in the following cells.
text = """Figure 7:  Schematic inheritance relations and properties for the top-level Self-Description  
Schemas.

© BMWi

132  CORE ARCHITECTURE ELEMENTS

2.5 Catalogue

The concept Self-Description is the foundation of the 
federated GAIA-X Catalogues. Catalogues are the 
main building block for the publication and discovery 
of Self-Descriptions of Assets and Participants. To sat-
isfy Consumer needs and to objectively find the best 
fitting offerings in the tangle of registered Assets, an 
open and transparent query algorithm is implemented 
without any GAIA-X internal ranking. Beside search 
functionality, a graph-based navigation interface is 
provided to traverse the complex tangle of offered 
Services, Nodes and linked Self-Descriptions, includ-
ing the attached claims with chain of trust statements. 
Consumers can verify each Self-Description individu-
ally and decide which one to select in a self-sovereign 
manner – GAIA-X does not act as a runtime interme-
diary or broker.
"""

In [None]:
!pip install textacy
!python -m spacy download de_core_news_lg

In [None]:
# Pass the raw sample text through textacy's hyphenated-word normalizer and
# print the result so the line structure stays visible.
import textacy.preprocessing

dehyphenate = textacy.preprocessing.normalize.hyphenated_words
text_hyphen = dehyphenate(text)
print(text_hyphen)

In [None]:
# Apply textacy's whitespace normalizer to the de-hyphenated text and print it.
text_whitespace_normalized = textacy.preprocessing.normalize.whitespace(text_hyphen)
print(text_whitespace_normalized)

In [None]:
# German sample paragraph used by the extraction examples below. Adjacent string
# literals are concatenated, so the value is one single line of text.
p1 = (
    "Doch das Ende des Jahres 2020 birgt auch Hoffnung, dass durch die Vakzinen "
    "gegen Covid-19 wieder Normalität einkehre – wie immer die auch aussehen mag "
    "– und wir uns um anderes Dringliches kümmern oder einfach entspannen "
    "können. Und dass durch den im Januar anstehenden Bewohnerwechsel im "
    "Weißen Haus zu Washington D.C. das offizielle Herumgetrumpel auf dem "
    "gesunden Menschenverstand ein Ende finden möge."
)

In [None]:
# Keyword-in-context view: every occurrence of "Hoffnung" in the paragraph,
# requested with a window width of 35.
kwic_hits = textacy.extract.kwic.keyword_in_context(
    p1, "Hoffnung", window_width=35
)
list(kwic_hits)

In [None]:
# Load the German spaCy pipeline through textacy and parse the sample paragraph with it.
de = textacy.load_spacy_lang("de_core_news_lg")
doc = textacy.make_spacy_doc(p1, lang=de)
# `preview` is read from the doc's extension namespace (`doc._`) and shown as the cell output.
doc._.preview

In [None]:
# Trigrams of the parsed paragraph; stop words and punctuation are filtered,
# numbers are kept.
trigrams = textacy.extract.ngrams(
    doc,
    3,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
)
list(trigrams)

In [None]:
# Named entities found in the paragraph, requested with determiners dropped.
named_entities = textacy.extract.entities(doc, drop_determiners=True)
list(named_entities)

In [None]:
# Noun chunks of the parsed paragraph, materialized for display.
noun_phrases = textacy.extract.noun_chunks(doc)
list(noun_phrases)

In [None]:
# Top 10 key terms of the paragraph via textacy's TextRank implementation,
# with terms normalized to their lemma.
import textacy.extract.keyterms

top_keyterms = textacy.extract.keyterms.textrank(doc, normalize="lemma", topn=10)
top_keyterms

In [None]:
# Bag of terms (uni-, bi- and trigrams plus entities) weighted by raw count,
# wrapped in a Counter to list the 10 most frequent terms.
from collections import Counter

term_bag = doc._.to_bag_of_terms(ngs=(1, 2, 3), ents=True, weighting="count")
bot = Counter(term_bag)
bot.most_common(10)

In [None]:
# Locate the article database: download it when running on Google Colab,
# otherwise use the copy that ships next to the notebooks.
import os
import sys

ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # The existence check is done in Python rather than with `test -f ... || wget ... && gunzip ...`:
    # in the shell `||` and `&&` have equal precedence and associate left to right, so the
    # one-liner would run `gunzip` (and fail) even when the database is already present.
    if not os.path.isfile("heise-articles-2020.db"):
        # NOTE(review): the URL contains a `/blob/main/` path segment — confirm that it
        # serves the raw gzip archive and not an HTML page.
        status = os.system("wget  https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz && gunzip heise-articles-2020.db.gz")
        if status != 0:
            # Fail here with a clear message instead of later with an obscure SQL error.
            raise RuntimeError(
                "Download or decompression of heise-articles-2020.db.gz failed "
                f"(exit status {status})"
            )
    newsticker_db = 'heise-articles-2020.db'
else:
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
# Select the 100 most recently published articles
import sqlite3 
import pandas as pd

# The connection is deliberately left open under the name `sql` so that other
# cells can keep querying the database.
sql = sqlite3.connect(newsticker_db)
df = pd.read_sql("SELECT * FROM articles \
                  ORDER BY datePublished DESC LIMIT 100", sql)
# Concatenating a missing value with a string yields NaN for the whole row, and a
# NaN entry is not usable as document text. Missing parts are therefore treated
# as empty strings before title, header and body are joined.
df["full_text"] = (
    df["title"].fillna("")
    + "\n"
    + df["header"].fillna("")
    + "\n"
    + df["text"].fillna("")
)

In [None]:
# Build a textacy corpus from the combined article texts, processed with the
# German spaCy pipeline.
import textacy

article_texts = df["full_text"].tolist()
corpus = textacy.Corpus("de_core_news_lg", data=article_texts)

In [None]:
# Corpus size at a glance: number of documents, sentences and tokens.
tuple(getattr(corpus, attr) for attr in ("n_docs", "n_sents", "n_tokens"))

In [None]:
# Corpus-wide word counts keyed by lemma; show the 10 most frequent ones.
lemma_frequencies = corpus.word_counts(by="lemma_")
word_counts = Counter(lemma_frequencies)
word_counts.most_common(10)

In [None]:
# Create a handle for the CapitolWords dataset that ships with textacy.
import textacy.datasets
ds = textacy.datasets.CapitolWords()

In [None]:
# Download the dataset, then request only the records of the two selected speakers.
ds.download()
speakers = {"Hillary Clinton", "Barack Obama"}
records = ds.records(speaker_name=speakers)

In [None]:
# Materialize every selected record for display.
[record for record in records]