In [None]:
!pip install textacy
!python -m spacy download de_core_news_lg

In [None]:
import spacy

# Load the large German spaCy pipeline (the model fetched by `spacy download de_core_news_lg`).
nlp = spacy.load("de_core_news_lg")

In [None]:
import sys, os
# Colab is detected by the presence of the google.colab module in sys.modules.
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # Download and unpack the article database only if it is not present yet.
    # The parentheses matter: `||` and `&&` have equal precedence in the shell,
    # so without grouping the command is read as `(test || wget) && gunzip` and
    # gunzip would also run (and fail) when the database already exists.
    os.system("test -f heise-articles-2020.db || (wget https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz && gunzip heise-articles-2020.db.gz)")
    newsticker_db = 'heise-articles-2020.db'
else:
    # Outside Colab, use the copy at this relative path.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3 
import pandas as pd

# Open the article database and load all articles ordered by publication date,
# indexed by their id, with datePublished parsed as datetime.
sql = sqlite3.connect(newsticker_db)
df = pd.read_sql("SELECT * FROM articles ORDER BY datePublished", sql, index_col="id", parse_dates=["datePublished"])
# Combine title, header and body into one text per article. A missing (NULL)
# part is treated as an empty string; plain `+` concatenation would otherwise
# turn the whole full_text into NaN and lose the parts that are present.
df["full_text"] = df["title"].fillna("") + "\n" + df["header"].fillna("") + "\n" + df["text"].fillna("")

In [None]:
from tqdm.auto import tqdm

def analyze_sentence(sent):
    """Collect lemmas, entities and noun chunks of a single sentence.

    Parameters
    ----------
    sent : iterable of tokens exposing ``pos_`` and ``lemma_``; the object
        itself must provide ``ents`` and ``noun_chunks`` (e.g. a spaCy span).

    Returns
    -------
    tuple
        ``(nouns, adjectives, verbs, lemmas, nav, entities, noun_chunks)``:
        lemmas of NOUN/PROPN tokens, lemmas of ADJ/ADV tokens, lemmas of
        VERB/AUX tokens, lemmas of all tokens, lemmas of the three groups
        combined in token order, and the entities and noun chunks as strings.
    """
    nouns, adjectives, verbs = [], [], []
    lemmas, nav = [], []

    # Maps each relevant part-of-speech tag to the list that collects it.
    bucket_for_pos = {
        "NOUN": nouns,
        "PROPN": nouns,
        "ADJ": adjectives,
        "ADV": adjectives,
        "VERB": verbs,
        "AUX": verbs,
    }

    for token in sent:
        lemma = token.lemma_
        bucket = bucket_for_pos.get(token.pos_)
        # Compare against None: an empty bucket list is falsy but still valid.
        if bucket is not None:
            bucket.append(lemma)
            nav.append(lemma)
        lemmas.append(lemma)

    entities = list(map(str, sent.ents))
    chunks = list(map(str, sent.noun_chunks))
    return (nouns, adjectives, verbs, lemmas, nav, entities, chunks)

def resentence(words):
    """Flatten per-sentence word lists into a single string.

    Words within a sentence are joined with "|", sentences with "#".
    "." is not used as the sentence separator because it can be part of an
    entity; "#" is much less likely to occur. Empty sentences (e.g. those
    without entities or adjectives) are skipped.

    Parameters
    ----------
    words : iterable of sequences of str, one sequence per sentence.

    Returns
    -------
    str
        The joined representation, "" if there is nothing to join.
    """
    joined_sentences = []
    for sent_words in words:
        if len(sent_words) == 0:
            continue
        joined_sentences.append("|".join(sent_words))
    return "#".join(joined_sentences)

# Run the spaCy pipeline over every article of the DataFrame and store the
# per-sentence results as flat strings in new columns.
for i, r in tqdm(df.iterrows(), total=len(df)):
    doc = nlp(str(r["full_text"]))
    # One list per feature; each receives one entry (a list of strings) per sentence.
    nouns = []
    adjectives = []
    verbs = []
    lemmas = []
    nav = []
    entities = []
    noun_chunks = []
    for sentence in doc.sents:
        # Analyze the sentence
        (sent_nouns, sent_adjectives, sent_verbs, sent_lemmas, sent_nav,
         sent_entities, sent_noun_chunks) = analyze_sentence(sentence)

        # Keep the values of every sentence
        nouns.append(sent_nouns)
        adjectives.append(sent_adjectives)
        verbs.append(sent_verbs)
        nav.append(sent_nav)
        lemmas.append(sent_lemmas)
        entities.append(sent_entities)
        noun_chunks.append(sent_noun_chunks)

    # Store the re-joined sentences ("|" between words, "#" between sentences)
    df.at[i, "nouns"]       = resentence(nouns)
    df.at[i, "adjectives"]  = resentence(adjectives)
    df.at[i, "verbs"]       = resentence(verbs)
    df.at[i, "lemmas"]      = resentence(lemmas)
    df.at[i, "nav"]         = resentence(nav)
    df.at[i, "entities"]    = resentence(entities)
    df.at[i, "noun_chunks"] = resentence(noun_chunks)

    # Count directly on the per-sentence lists. Counting "|" in the joined
    # string misses the "#" sentence separators (one token/chunk too few per
    # additional sentence) and yields 1 instead of 0 for an empty result.
    df.at[i, "no_tokens"]      = sum(len(s) for s in lemmas)
    df.at[i, "no_sentences"]   = len(lemmas)
    df.at[i, "no_noun_chunks"] = sum(len(s) for s in noun_chunks)

In [None]:
# Write the enriched DataFrame to table "nlp_articles" using the connection opened
# for reading the articles; an existing table of that name is replaced.
df.to_sql("nlp_articles", sql, if_exists="replace")