In [1]:
import spacy
import pandas as pd
# Load the spaCy pipeline once; the extraction functions in later cells share this `nlp` object.
nlp = spacy.load('de_core_news_sm')

# Lowercase fragments that the extraction functions search for as substrings
# of lowercased lemmas (e.g. "solidar" also hits longer words containing it).
political_vocabulary = [
    "freiheit",
    "gleich",
    "solidar",
    "sozial",
    "privatisierung",
    "leistung",
    "heimat",
    "mitbestimmung",
    "nachhaltig",
    "umwelt",
    "armut",
    "diskriminierung",
    "demokratisierung",
    "digital",
]

# Party keys; each one is interpolated into the manifest file name
# "../assets/manifests/{party}_manifest.txt".
parties = ["afd", "cdu_csu", "fdp", "spd", "grüne", "linke"]

# Medium names; each one is compared for equality against the `medium` column
# of the transcript DataFrame, so spelling and case must match exactly.
media = [
    "junge Welt",
    "NachDenkSeiten",
    "taz",
    "Süddeutsche Zeitung",
    "stern TV",
    "DER SPIEGEL",
    "Der Tagesspiegel",
    "ARD",
    "Tagesschau",
    "ZDF",
    "ZDFheute Nachrichten",
    "Bayerischer Rundfunk",
    "ntv Nachrichten",
    "RTL",
    "FOCUS Online",
    "ZEIT ONLINE",
    "faz",
    "WELT",
    "BILD",
    "NZZ Neue Zürcher Zeitung",
    "Junge Freiheit",
    "COMPACTTV",
]

In [2]:
def extract_party_phrase_counts(party):
    """Count vocabulary hits in one party's manifest.

    Reads ``../assets/manifests/{party}_manifest.txt`` (UTF-8, undecodable
    bytes are dropped), runs it through the shared ``nlp`` pipeline with only
    the ``lemmatizer`` pipe enabled, lowercases every lemma and keeps the
    purely alphabetic ones. For each entry of ``political_vocabulary`` it then
    counts how many of those lemmas contain the entry as a substring.

    Args:
        party: Party key used to build the manifest file name.

    Returns:
        pandas.Series indexed by the entries of ``political_vocabulary`` plus
        a final ``'total'`` entry holding the number of alphabetic lemmas.
    """
    manifest_path = f"../assets/manifests/{party}_manifest.txt"
    with open(manifest_path, encoding="utf-8", errors="ignore") as manifest_file:
        text = manifest_file.read()

    with nlp.select_pipes(enable="lemmatizer"):
        parsed = nlp(text)

    # Lowercased lemmas, restricted to tokens made of letters only.
    words = []
    for token in parsed:
        lowered = token.lemma_.lower()
        if lowered.isalpha():
            words.append(lowered)

    counts = dict.fromkeys(political_vocabulary, 0)
    for stem in political_vocabulary:
        # Substring match: a lemma counts once per vocabulary entry it contains.
        counts[stem] += sum(stem in word for word in words)
    counts['total'] = len(words)
    return pd.Series(counts)

In [3]:
def extract_medium_phrase_counts(df, medium):
    """Count vocabulary hits across all transcripts of one medium.

    Selects the rows of ``df`` whose ``medium`` column equals ``medium`` and
    lemmatizes each ``transcript`` through the shared ``nlp`` pipeline with
    only the ``lemmatizer`` pipe enabled. Lemmas are lowercased and only the
    purely alphabetic ones are kept. For every entry of
    ``political_vocabulary`` the function counts how many of those lemmas
    contain the entry as a substring, summed over all selected transcripts.

    Args:
        df: DataFrame with at least the columns ``medium`` and ``transcript``.
        medium: Value of the ``medium`` column to select.

    Returns:
        pandas.Series indexed by the entries of ``political_vocabulary``.
        All counts are zero when no row matches ``medium``. No ``'total'``
        entry is added.
    """
    phrase_counts = {phrase: 0 for phrase in political_vocabulary}

    # Missing transcripts (None/NaN) cannot be passed to spaCy and carry no
    # tokens, so they are dropped instead of aborting the whole medium.
    transcripts = df.loc[df['medium'] == medium, 'transcript'].dropna()

    # Restrict the pipeline once for the whole batch rather than once per
    # transcript. `nlp.pipe` is lazy, so it must be consumed inside the
    # `with` block while the other pipes are still disabled.
    with nlp.select_pipes(enable="lemmatizer"):
        for preprocessed in nlp.pipe(transcripts):
            medium_vocabulary = [
                lemma
                for lemma in (token.lemma_.lower() for token in preprocessed)
                if lemma.isalpha()
            ]
            for political_phrase in political_vocabulary:
                # Substring match: a lemma counts once per entry it contains.
                phrase_counts[political_phrase] += sum(
                    political_phrase in medium_phrase
                    for medium_phrase in medium_vocabulary
                )
    return pd.Series(phrase_counts)

In [4]:
# One Series of vocabulary counts per party, keyed by party name, then shown
# side by side with one column per party.
phrase_dict = dict(zip(parties, map(extract_party_phrase_counts, parties)))
pd.DataFrame(phrase_dict)

Unnamed: 0,afd,cdu_csu,fdp,spd,grüne,linke
freiheit,62,49,70,23,46,54
gleich,71,103,84,77,207,203
solidar,5,7,7,33,16,96
sozial,94,89,69,85,147,410
privatisierung,3,0,0,2,2,21
leistung,70,103,50,47,72,100
heimat,16,14,2,2,6,3
mitbestimmung,2,3,0,6,10,28
nachhaltig,4,72,43,41,110,38
umwelt,8,17,17,18,56,43


In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load a topic.pkl
# that comes from a trusted source.
df = pd.read_pickle('../data/topic.pkl')
# One Series of vocabulary counts per medium, keyed by medium name.
# This rebinds `phrase_dict`, replacing the per-party counts built earlier.
phrase_dict = dict(
    zip(media, (extract_medium_phrase_counts(df, outlet) for outlet in media))
)
pd.DataFrame(phrase_dict)

In [9]:
# Vocabulary counts for the single medium 'WELT', displayed as the cell output.
extract_medium_phrase_counts(df, medium='WELT')