In [1]:
from collections import Counter
from pathlib import Path
import xml.etree.ElementTree as ET

import pandas as pd
import simplemma
import spacy
from spacy_iwnlp import spaCyIWNLP

In [2]:
tree = ET.parse('btw21_no_lb.xml')

In [3]:
input_data = {}
for child in tree.getroot():
    li = []
    for x in child:
        li.append(x.text)
    input_data[child.attrib['party'].replace('Gruene', 'Grüne')] = '. '.join(li)

In [4]:
data = input_data.values()

In [5]:
nlp = spacy.load("de_core_news_lg")
iwnlp = spaCyIWNLP(lemmatizer_path='IWNLP.Lemmatizer_20181001.json')
nlp.add_pipe(iwnlp)

In [6]:
data = [[t for t in list(nlp(d)) if t.pos_ in ['NOUN', 'PROPN'] and str(t) != '"'] for d in data]

In [7]:
data = [[str(t if t._.iwnlp_lemmas is None else t._.iwnlp_lemmas[0]) for t in d] for d in data]

In [8]:
import simplemma
langdata = simplemma.load_data('de', 'en')
data = [[simplemma.lemmatize(t, langdata, greedy=False) for t in d] for d in data]

In [9]:
data = [[t.replace('Mitgliedsstaat', 'Mitgliedstaat').replace('Elter', 'Eltern') for t in d] for d in data]

In [10]:
cs = [Counter(d) for d in data]

In [11]:
exclude = ['AfD', 'CDU', 'LINKE', 'SPD', '%', 'Demokrat', 'Kapitel', 'Bürger', 'Bürgerin', 'Bürger*innen']

In [12]:
for x in exclude:
    for c in cs:
        del c[x]

In [13]:
def proper_ranking(c):
    res = {}
    prev_count = -1
    prev_rank = -1
    for rank, pair in enumerate(c.most_common()):
        if pair[1] != prev_count:
            new_rank = rank
        else:
            new_rank: prev_rank
        res[pair[0]] = new_rank
        prev_rank = new_rank
        prev_count = pair[1]
    return res

In [14]:
rankings = [proper_ranking(c) for c in cs]

In [15]:
# choose the top 20 words for each party
s = set()
for r in rankings:
    for w, rank in r.items():
        if rank < 20:
            s.add(w)
        else:
            break

In [16]:
parties = list(input_data.keys())

In [17]:
parties

['CDU', 'SPD', 'Linke', 'Grüne', 'FDP', 'AfD']

In [18]:
final_words = [x for x in s if x not in exclude]

In [19]:
rows = []
for w in final_words:
    for p_idx, r in enumerate(rankings):
        value = -1
        if w in r:
            value = r[w]
        rows.append({'value': value, 'word': w, 'party': parties[p_idx]})
df = pd.DataFrame(rows)

In [20]:
df.to_csv('data.csv', index=False)

In [21]:
df

Unnamed: 0,value,word,party
0,56,Arbeit,CDU
1,5,Arbeit,SPD
2,4,Arbeit,Linke
3,16,Arbeit,Grüne
4,27,Arbeit,FDP
...,...,...,...
313,22,Bildung,SPD
314,13,Bildung,Linke
315,21,Bildung,Grüne
316,8,Bildung,FDP


In [22]:
df_comp = df
df_comp['value'] = df_comp['value'].apply(lambda x: 2000 if x == -1 else x)

In [23]:
df_comp

Unnamed: 0,value,word,party
0,56,Arbeit,CDU
1,5,Arbeit,SPD
2,4,Arbeit,Linke
3,16,Arbeit,Grüne
4,27,Arbeit,FDP
...,...,...,...
313,22,Bildung,SPD
314,13,Bildung,Linke
315,21,Bildung,Grüne
316,8,Bildung,FDP


In [24]:
w_c = [[df_comp[df_comp['word'] == w]['value'].sum(), w] for w in final_words]

In [25]:
print('", "'.join([x[1] for x in sorted(w_c, key=lambda x: x[0], reverse=True)]))

Volk", "Respekt", "Abschaffung", "Wettbewerb", "Sprache", "Innovation", "Identität", "Deutschlands", "Prozent", "Landwirtschaft", "Chance", "Beschäftigte", "Freiheit", "Zusammenarbeit", "Eltern", "Beispiel", "Zugang", "Einkommen", "Welt", "Maßnahme", "Ausbau", "Politik", "Rahmen", "Demokratie", "Euro", "Kommune", "Bereich", "Förderung", "Arbeit", "Zukunft", "Möglichkeit", "Schule", "Familie", "Schutz", "Bund", "Sicherheit", "Leben", "Frau", "Entwicklung", "Wirtschaft", "Recht", "Bildung", "Europa", "EU", "Staat", "Gesellschaft", "Ziel", "Unternehmen", "Jahr", "Kind", "Land", "Deutschland", "Mensch
