## Prerequisites

You need to have the following packages installed. If you want to do the tokenization yourself, you also have to download the spaCy model for German.
    
        pip install spacy pandas plotly wordcloud requests
        python -m spacy download de_core_news_lg

Dataset-Citation:
 Richter, Florian; Koch, Philipp; Franke, Oliver; Kraus, Jakob; Kuruc, Fabrizio; Thiem, Anja; Högerl, Judith; Heine, Stella; Schöps, Konstantin, 2020, "Open Discourse", https://doi.org/10.7910/DVN/FIKIBO, Harvard Dataverse, V3; speeches.csv [fileName]


In [None]:
# Always run this cell: it imports everything the later cells use and defines the data directory.

import pandas as pd
import plotly
import spacy
from wordcloud import WordCloud, get_single_color_func
from pathlib import Path
import requests

# Directory from which speeches.csv is read and in which the per-faction
# frequency CSV files are stored.
data_path = Path("data")

In [None]:
# This cell and the next two are only needed if you want to tokenize the speeches yourself,
# which takes roughly an hour.
# If you want to use the pre-tokenized version, skip this cell and the next two.

nlp = spacy.load('de_core_news_lg')

# Single source of truth for the file location: the existence check, the
# download target and the read below must all point at the same path.
speeches_file = data_path / 'speeches.csv'
if not speeches_file.is_file():
    url = 'https://dataverse.harvard.edu/api/access/datafile/4745985'
    r = requests.get(url)
    # Fail loudly instead of saving an HTTP error body as speeches.csv.
    r.raise_for_status()
    # Make sure the data directory exists before writing into it.
    data_path.mkdir(parents=True, exist_ok=True)
    with open(speeches_file, 'wb') as f:
        f.write(r.content)

SPEECHES = pd.read_csv(speeches_file, index_col=0)
SPEECHES.tail()

In [None]:
# Only speeches from these electoral terms are analysed.
electoral_terms = [18, 19]
current_speeches = SPEECHES[SPEECHES['electoralTerm'].isin(electoral_terms)]

# Faction name -> id that is matched against the `factionId` column.
factions = {
    'AFD': 0,
    'Grüne': 3,
    'Union': 4,
    'Linke': 6,
    'FDP': 13,
    'SPD': 23,
}

# One list of strings per faction, in the insertion order of `factions`.
# Each string holds the space-separated non-stop-word nouns of one speech.
speeches = []

# warning this takes at least 40 minutes, after that I went to bed

for faction in factions.values():
    print(faction)
    factions_speeches = current_speeches[current_speeches['factionId'] == faction]
    # Drop missing speeches before converting to str; otherwise a missing
    # value becomes the literal text "nan" and can end up counted as a word.
    speech_content = factions_speeches['speechContent'].dropna().astype(str).to_list()
    # nlp.pipe streams the documents in batches, so only the extracted nouns
    # are kept instead of every parsed Doc of the faction at once.
    words_str = [
        ' '.join(
            token.text
            for token in doc
            if not token.is_stop and token.pos_ == "NOUN"
        )
        for doc in nlp.pipe(speech_content)
    ]
    speeches.append(words_str)

In [None]:
def word_frequency(long_str) -> pd.DataFrame:
    """Count how often each whitespace-separated word occurs.

    Args:
        long_str: Either a single string or an iterable of strings (for
            example one string per speech). A single string is treated as
            one entry rather than being iterated character by character.

    Returns:
        A dataframe indexed by word with a single ``frequency`` column,
        sorted by descending frequency.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    if isinstance(long_str, str):
        long_str = [long_str]

    counts = Counter()
    for entry in long_str:
        counts.update(entry.split())

    # One row per word: the word is the index, the count is the only column.
    word_freq = pd.DataFrame.from_dict(dict(counts), orient='index', columns=['frequency'])
    return word_freq.sort_values(by='frequency', ascending=False)

# `speeches` holds one entry per faction in the order AFD, Grüne, Union,
# Linke, FDP, SPD; compute a descending frequency table for each of them.
afd_freq, gruene_freq, union_freq, linke_freq, fdp_freq, spd_freq = [
    word_frequency(faction_speeches).sort_values(by='frequency', ascending=False)
    for faction_speeches in speeches[:6]
]

# Persist every table as a '#'-separated CSV inside the data directory.
for file_name, frequencies in (
    ('afd_freq.csv', afd_freq),
    ('gruene_freq.csv', gruene_freq),
    ('union_freq.csv', union_freq),
    ('linke_freq.csv', linke_freq),
    ('fdp_freq.csv', fdp_freq),
    ('spd_freq.csv', spd_freq),
):
    frequencies.to_csv(data_path / file_name, sep='#')

In [None]:
# Continue here if you want to use the pre-tokenized version.

# keep_default_na=False: the index column holds words, so tokens such as
# "NA", "null" or "nan" must stay strings instead of being parsed as missing
# values (float NaN) by pandas' default NA handling.
read_options = {'sep': '#', 'index_col': 0, 'keep_default_na': False}

afd_freq = pd.read_csv(data_path / 'afd_freq.csv', **read_options)
gruene_freq = pd.read_csv(data_path / 'gruene_freq.csv', **read_options)
union_freq = pd.read_csv(data_path / 'union_freq.csv', **read_options)
linke_freq = pd.read_csv(data_path / 'linke_freq.csv', **read_options)
fdp_freq = pd.read_csv(data_path / 'fdp_freq.csv', **read_options)
spd_freq = pd.read_csv(data_path / 'spd_freq.csv', **read_options)

# Same faction order as everywhere else: AFD, Grüne, Union, Linke, FDP, SPD.
freqs = [afd_freq, gruene_freq, union_freq, linke_freq, fdp_freq, spd_freq]

In [None]:
def normalize_frequency(df_list, names=None, smoothing=5):
    """
    Normalizes the word frequencies of several factions so that the
    characteristics of each faction can be compared.

    For every word the score of a faction is
    (frequency by that faction / total words of that faction)
    / ((frequency by all factions + smoothing) / total words of all factions).

    The same formula is applied to every faction. The smoothing term is only
    added to the pooled count, which damps words that are rare overall.

    Args:
        df_list: Frequency dataframes, one per faction, each indexed by word
            with a ``frequency`` column. Only the words of the first
            dataframe are scored.
        names: Column names of the result, one per dataframe. Defaults to
            AFD, Grüne, Union, Linke, FDP, SPD.
        smoothing: Constant added to the pooled count of each word.

    Returns:
        A dataframe indexed like the first dataframe with one column of
        normalized scores per faction.

    Raises:
        ValueError: If the number of names does not match the number of
            dataframes.
    """
    if names is None:
        names = ['AFD', 'Grüne', 'Union', 'Linke', 'FDP', 'SPD']
    names = list(names)
    if len(names) != len(df_list):
        raise ValueError(
            f"expected {len(names)} dataframes (one per name), got {len(df_list)}"
        )

    vocabulary = df_list[0].index

    # Word counts of every faction aligned to the vocabulary of the first
    # dataframe; words a faction never used count as 0.
    counts = pd.DataFrame(
        {
            name: df['frequency'].reindex(vocabulary, fill_value=0).to_numpy()
            for name, df in zip(names, df_list)
        },
        index=vocabulary,
        columns=names,
    )

    total_words = pd.Series([df['frequency'].sum() for df in df_list], index=names)
    total_words_all = total_words.sum()

    # Smoothed share of each word in the pooled corpus of all factions.
    pooled_share = (counts.sum(axis=1) + smoothing) / total_words_all

    faction_share = counts.div(total_words, axis='columns')
    return faction_share.div(pooled_share, axis='index')


# Score every faction against the pooled corpus; the list order determines
# which dataframe is treated as AFD, Grüne, Union, Linke, FDP and SPD.
faction_frequencies = [afd_freq, gruene_freq, union_freq, linke_freq, fdp_freq, spd_freq]
normalized = normalize_frequency(faction_frequencies)

In [None]:
# Directory the word cloud images are written to.
viz_path = Path('visualizations')

# Faction name -> colour name used for that faction's word cloud.
_faction_color_names = {
    'AFD': 'SaddleBrown',
    'Grüne': 'Green',
    'Union': 'darkblue',
    'Linke': 'MediumVioletRed',
    'FDP': 'Gold',
    'SPD': 'red',
}

# Faction name -> single-colour function passed to WordCloud as color_func.
colors = {
    faction: get_single_color_func(color_name)
    for faction, color_name in _faction_color_names.items()
}

def word_clouds(df) -> None:
    """
    Creates a word cloud image for each faction column of a dataframe.

    Every column of ``df`` is used as a word -> weight mapping and rendered
    to ``<viz_path>/<column name>.png`` in the colour configured in
    ``colors``.

    Raises:
        KeyError: If a column has no entry in ``colors``.
    """
    # Make sure the output directory exists before the first image is saved.
    viz_path.mkdir(parents=True, exist_ok=True)

    # Iterate over the columns that are actually present instead of the
    # global `factions`, which only exists when the optional tokenization
    # cell was run.
    for faction in df.columns:
        print(faction)
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            color_func=colors[faction],
            prefer_horizontal=1,  # don't write words vertically
        )
        wordcloud.generate_from_frequencies(df[faction])
        wordcloud.to_file(viz_path / f'{faction}.png')

word_clouds(normalized)