In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
# Load the spaCy pipeline named 'de_core_news_sm'.
# NOTE(review): `nlp` is not referenced in any of the cells visible here — confirm it is still needed.
nlp = spacy.load('de_core_news_sm')

# Party labels.
# NOTE(review): this list (with the merged 'cdu_csu' label) is not used by the
# cells visible here, which treat 'cdu' and 'csu' separately — confirm it is needed.
parties = ["linke", "grüne", "spd", "fdp", "cdu_csu", "afd"]

# Media outlets included in the analysis. The order of this list is the row
# order of every per-medium table below, because they are selected with `.loc[media]`.
media = [
    "junge Welt", "NachDenkSeiten", "taz",
    "Süddeutsche Zeitung", "stern TV", "DER SPIEGEL",
    "ZEIT ONLINE", "Der Tagesspiegel", "ARD",
    # "tagesschau",  (left disabled, as in the original list)
    "ZDF", "ZDFheute Nachrichten", "Bayerischer Rundfunk",
    "ntv Nachrichten", "RTL", "FOCUS Online",
    "faz", "WELT", "BILD",
    "NZZ Neue Zürcher Zeitung", "Junge Freiheit", "COMPACTTV",
]

# Substrings looked up in the `preprocessed` text column to decide whether a
# row mentions a party. 'union' is listed for both 'cdu' and 'csu', so a hit
# on it sets both flags.
# NOTE(review): 'afg' presumably catches a mis-transcription of 'afd' — confirm.
search_terms = {
    "cdu": ["cdu", "union"],
    "csu": ["csu", "union"],
    "fdp": ["fdp", "freien demokraten"],
    "grüne": ["grünen"],
    "linke": ["linke", "linkspartei"],
    "afd": ["afd", "afg"],
    "spd": ["spd", "sozialdemokraten"],
}

# Load the combined topics frame and add one boolean `contains_<party>` column
# per entry of `search_terms`, plus `contains_party` for "any party mentioned".
# NOTE(review): read_pickle can execute arbitrary code from the file — only load trusted pickles.
df = pd.read_pickle('../data/topics_combined.pkl')
for party, terms in search_terms.items():
    flag_column = 'contains_' + party
    # Start all-False so a party with an empty term list still gets a column.
    df[flag_column] = False
    for term in terms:
        # The terms are plain substrings, so match them literally (regex=False);
        # na=False makes rows with missing text count as "not mentioned".
        df[flag_column] |= df['preprocessed'].str.contains(term, regex=False, na=False)
# Derived from the same keys as the per-party flags so the two cannot drift apart.
df['contains_party'] = df[['contains_' + party for party in search_terms]].any(axis=1)

# Build `politicians`: lower-cased member name -> lower-cased party label,
# taken from the Bundestag member table.
# NOTE(review): read_pickle can execute arbitrary code from the file — only load trusted pickles.
bt = pd.read_pickle('../assets/bundestag.pkl')
# Drop the columns that are not needed for the name -> party lookup
# (the original list named 'Listen-platz' twice).
bt = bt.drop(columns=[
    'Geburts-jahr',
    'Land',
    'Listen-platz',
    'Erst-stimmen-anteil',
    'Wahlkreis',
    'BeruflicherHintergrund',
    'MdBseit',
    'Bemerkungen',
])
# Rows with these 'Fraktion(Partei)' values are excluded from the lookup.
# NOTE(review): 'fraktionslos(SSW)' has no space before the parenthesis, unlike
# the other entries — confirm it matches the value in the data.
party_blacklist = [
    'fraktionslos(SSW)',
    'fraktionslos (Zentrum)',
    'fraktionslos (AfD)',
    'fraktionslos (ehemals AfD)',
]
# .copy() so the relabelling below writes to an independent frame, not a slice of the original.
bt = bt[~bt['Fraktion(Partei)'].isin(party_blacklist)].copy()
# Give the two halves of the joint CDU/CSU group their own labels.
bt.loc[bt['Fraktion(Partei)'] == 'CDU/CSU (CDU)', 'Fraktion(Partei)'] = 'cdu'
bt.loc[bt['Fraktion(Partei)'] == 'CDU/CSU (CSU)', 'Fraktion(Partei)'] = 'csu'
# Only the party column is needed, so convert just that Series to a dict.
politician_dict = bt.set_index('Name')['Fraktion(Partei)'].to_dict()
politicians = {
    name.lower(): party_label.lower() for name, party_label in politician_dict.items()
}

In [None]:
def standardize_df(input_df):
    """Return a copy of ``input_df`` with every column centered on its own mean.

    Only the column mean is subtracted; the values are not divided by the
    standard deviation. The result is rounded to three decimals and the input
    frame is left unmodified.
    """
    centered = input_df.apply(lambda column: column - column.mean())
    return centered.round(3)

def columnwise_percentage(input_df):
    """Return a copy of ``input_df`` with each column divided by its own sum.

    The values are fractions of the column total (not multiplied by 100),
    rounded to three decimals. A column whose sum is zero is not special-cased.
    The input frame is left unmodified.
    """
    column_totals = {name: input_df[name].sum() for name in input_df.columns}
    shares = input_df.copy()
    for name, total in column_totals.items():
        shares[name] = shares[name] / total
    return shares.round(3)

In [None]:
# Per medium, count the rows that carry each party flag, keep only the media in
# `media` (in that order), and flip the table so parties are rows and media are columns.
party_flag_columns = [
    'contains_linke',
    'contains_grüne',
    'contains_spd',
    'contains_fdp',
    'contains_csu',
    'contains_cdu',
    'contains_afd',
]
mentions_by_media = (
    df.groupby(['medium'])[party_flag_columns]
    .sum()
    .loc[media]
    .transpose()
)
# Relabel the rows positionally; the order must match `party_flag_columns`.
# NOTE(review): assigning a nested list makes pandas build a one-level MultiIndex
# instead of a plain Index. Kept unchanged because the pickles written further
# down inherit it — confirm whether downstream readers rely on that.
mentions_by_media.index = [['linke', 'grüne', 'spd', 'fdp', 'csu', 'cdu', 'afd']]

In [None]:
# Share of each medium's party mentions that goes to each party
# (rows = media, columns = parties).
party_shares = columnwise_percentage(mentions_by_media).transpose()

# Heatmap 1: the raw shares.
df_to_plot = party_shares
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(
    df_to_plot,
    annot=True,
    fmt=".1%",
    linewidths=.5,
    ax=ax,
    center=np.nanmean(df_to_plot),  # center the colormap on the overall mean, ignoring NaN
)
ax.set(
    xlabel='party',
    ylabel='medium',
    title='percentage of mentions by medium y\nthat are about party x',
)

# Heatmap 2: the same shares with each party column centered on its mean across media.
df_to_plot = standardize_df(party_shares)
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(
    df_to_plot,
    annot=True,
    fmt=".1%",
    linewidths=.5,
    ax=ax,
    center=np.nanmean(df_to_plot),
)
ax.set(
    xlabel='party',
    ylabel='medium',
    title='percentage of mentions by medium y\nthat are about party x\nstandardized across media',
)

In [None]:
# Per-party mean and standard deviation of the mention shares across media.
stats_df = columnwise_percentage(mentions_by_media).transpose()
party_share_means = stats_df.mean()
party_share_stds = stats_df.std()
print(f"party mentions means:\n{party_share_means}\n\nparty mentions standard deviation:\n{party_share_stds}")

In [None]:
# One integer counter column per party found in `politicians`.
for party in set(politicians.values()):
    df['politician_count_' + party] = 0

# For every politician, add 1 to their party's counter on each row whose text
# contains the name (a row counts a name at most once).
for politician, party in tqdm(politicians.items()):
    # Names are literal substrings, so regex=False keeps characters such as '.'
    # from being read as regex syntax; na=False makes missing text add 0
    # instead of turning the counter into NaN.
    df['politician_count_' + party] += df['preprocessed'].str.contains(politician, regex=False, na=False)

# Sum the counters per medium, keep only the media in `media` (in that order),
# and flip the table so parties are rows and media are columns.
politician_mentions_by_media = df.groupby(['medium'])[['politician_count_linke', 'politician_count_grüne', 'politician_count_spd', 'politician_count_fdp', 'politician_count_csu', 'politician_count_cdu', 'politician_count_afd']].sum()
politician_mentions_by_media = politician_mentions_by_media.loc[media].transpose()
# Relabel the rows positionally; the order must match the column list above.
# NOTE(review): assigning a nested list makes pandas build a one-level MultiIndex
# instead of a plain Index. Kept unchanged because the pickles written further
# down inherit it — confirm whether downstream readers rely on that.
politician_mentions_by_media.index = [['linke', 'grüne', 'spd', 'fdp', 'csu', 'cdu', 'afd']]

In [None]:
# Share of each medium's politician mentions that goes to politicians of each party
# (rows = media, columns = parties).
politician_shares = columnwise_percentage(politician_mentions_by_media).transpose()

# Heatmap 1: the raw shares.
df_to_plot = politician_shares
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(
    df_to_plot,
    annot=True,
    fmt=".1%",
    linewidths=.5,
    ax=ax,
    center=np.nanmean(df_to_plot),  # center the colormap on the overall mean, ignoring NaN
)
ax.set(
    xlabel='party',
    ylabel='medium',
    title='percentage of mentions by medium y\nthat are about politicians of party x',
)

# Heatmap 2: the same shares with each party column centered on its mean across media.
df_to_plot = standardize_df(politician_shares)
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(
    df_to_plot,
    annot=True,
    fmt=".1%",
    linewidths=.5,
    ax=ax,
    center=np.nanmean(df_to_plot),
)
ax.set(
    xlabel='party',
    ylabel='medium',
    title='percentage of mentions by medium y\nthat are about politicians of party x\nstandardized across media',
)

In [None]:
# Per-party mean and standard deviation of the politician-mention shares across media.
stats_df = columnwise_percentage(politician_mentions_by_media).transpose()
politician_share_means = stats_df.mean()
politician_share_stds = stats_df.std()
print(f"politician mentions means:\n{politician_share_means}\n\npolitician mentions standard deviation:\n{politician_share_stds}")

In [None]:
# Write the share tables (rows = media, columns = parties) to disk, once as raw
# shares and once centered per party. Rows of the count tables are first put in
# the order below, which places 'cdu' before 'csu'.
export_party_order = ['linke', 'grüne', 'spd', 'fdp', 'cdu', 'csu', 'afd']

party_export = columnwise_percentage(mentions_by_media.loc[export_party_order]).transpose().loc[media]
party_export.to_pickle('../data/mentions/party.pkl')
standardize_df(party_export).to_pickle('../data/mentions/party_standardized.pkl')

politician_export = columnwise_percentage(politician_mentions_by_media.loc[export_party_order]).transpose().loc[media]
politician_export.to_pickle('../data/mentions/politician.pkl')
standardize_df(politician_export).to_pickle('../data/mentions/politician_standardized.pkl')