In [40]:
from collections import Counter
from pathlib import Path
import xml.etree.ElementTree as ET

import pandas as pd
import spacy
from spacy_iwnlp import spaCyIWNLP

In [41]:
tree = ET.parse('btw21_no_lb.xml')

In [42]:
input_data = {}
for child in tree.getroot():
    li = []
    for x in child:
        li.append(x.text)
    input_data[child.attrib['party'].replace('Gruene', 'Grüne')] = '. '.join(li)

In [43]:
data = input_data.values()

In [44]:
nlp = spacy.load("de_core_news_lg")
iwnlp = spaCyIWNLP(lemmatizer_path='IWNLP.Lemmatizer_20181001.json')
nlp.add_pipe(iwnlp)

In [45]:
data = [[t for t in list(nlp(d)) if t.pos_ in ['NOUN', 'PROPN'] and str(t) != '"'] for d in data]

In [46]:
data = [[str(t if t._.iwnlp_lemmas is None else t._.iwnlp_lemmas[0]) for t in d] for d in data]

In [47]:
data = [[t.replace('Deutschlands', 'Deutschland').replace('Europas', 'Europa').replace('Mitgliedsstaat', 'Mitgliedstaat').replace('Elter', 'Eltern') for t in d] for d in data]

In [48]:
cs = [Counter(d) for d in data]

In [49]:
def proper_ranking(c):
    res = {}
    prev_count = -1
    prev_rank = -1
    for rank, pair in enumerate(c.most_common()):
        if pair[1] != prev_count:
            new_rank = rank
        else:
            new_rank: prev_rank
        res[pair[0]] = new_rank
        prev_rank = new_rank
        prev_count = pair[1]
    return res

In [50]:
rankings = [proper_ranking(c) for c in cs]

In [51]:
# choose the top 20 words for each party
s = set()
for r in rankings:
    for w, rank in r.items():
        if rank < 20:
            s.add(w)
        else:
            break

In [52]:
exclude = ['AfD', 'CDU', 'LINKE', 'SPD', '%', 'Demokrat', 'Kapitel', 'Bürger', 'Bürgerin', 'Bürger*innen']

In [53]:
parties = list(input_data.keys())

In [54]:
parties

['CDU', 'SPD', 'Linke', 'Grüne', 'FDP', 'AfD']

In [55]:
final_words = [x for x in s if x not in exclude]

In [56]:
rows = []
for w in final_words:
    for p_idx, r in enumerate(rankings):
        value = -1
        if w in r:
            value = r[w]
        rows.append({'value': value, 'word': w, 'party': parties[p_idx]})
df = pd.DataFrame(rows)

In [57]:
df.to_csv('data.csv', index=False)

In [58]:
df

Unnamed: 0,value,word,party
0,607,Respekt,CDU
1,18,Respekt,SPD
2,1328,Respekt,Linke
3,-1,Respekt,Grüne
4,553,Respekt,FDP
...,...,...,...
289,42,Zusammenarbeit,SPD
290,259,Zusammenarbeit,Linke
291,29,Zusammenarbeit,Grüne
292,83,Zusammenarbeit,FDP


In [59]:
df_comp = df
df_comp['value'] = df_comp['value'].apply(lambda x: 2000 if x == -1 else x)

In [60]:
df_comp

Unnamed: 0,value,word,party
0,607,Respekt,CDU
1,18,Respekt,SPD
2,1328,Respekt,Linke
3,2000,Respekt,Grüne
4,553,Respekt,FDP
...,...,...,...
289,42,Zusammenarbeit,SPD
290,259,Zusammenarbeit,Linke
291,29,Zusammenarbeit,Grüne
292,83,Zusammenarbeit,FDP


In [61]:
w_c = [[df_comp[df_comp['word'] == w]['value'].sum(), w] for w in final_words]

In [62]:
print('", "'.join([x[1] for x in sorted(w_c, key=lambda x: x[0], reverse=True)]))

Volk", "Respekt", "Abschaffung", "Wettbewerb", "Sprache", "Innovation", "Identität", "Prozent", "Chance", "Beschäftigte", "Freiheit", "Zusammenarbeit", "Eltern", "Beispiel", "Zugang", "Welt", "Maßnahme", "Ausbau", "Politik", "Rahmen", "Demokratie", "Euro", "Kommune", "Bereich", "Arbeit", "Möglichkeit", "Zukunft", "Schule", "Familie", "Schutz", "Bund", "Sicherheit", "Leben", "Frau", "Entwicklung", "Wirtschaft", "Recht", "Bildung", "EU", "Europa", "Staat", "Gesellschaft", "Ziel", "Unternehmen", "Jahr", "Kind", "Land", "Mensch", "Deutschland
