In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
from tqdm import tqdm
tqdm.pandas()

# Names of the media outlets considered in this analysis (order as originally listed).
media = [
    "junge Welt",
    "NachDenkSeiten",
    "taz",
    "Süddeutsche Zeitung",
    "stern TV",
    "DER SPIEGEL",
    "Der Tagesspiegel",
    "ARD",
    "tagesschau",
    "ZDF",
    "ZDFheute Nachrichten",
    "Bayerischer Rundfunk",
    "ntv Nachrichten",
    "RTL",
    "FOCUS Online",
    "ZEIT ONLINE",
    "faz",
    "WELT",
    "BILD",
    "NZZ Neue Zürcher Zeitung",
    "Junge Freiheit",
    "COMPACTTV",
]

# Party key -> lowercase substrings that count as a mention of that party.
# NOTE(review): 'afg' is listed as an alternative for 'afd' — presumably a
# frequent transcription variant; confirm against the data.
search_terms = {
    "cdu": ["cdu"],
    "csu": ["csu"],
    "fdp": ["fdp"],
    "grüne": ["grüne"],
    "linke": ["linke"],
    "afd": ["afd", "afg"],
    "spd": ["spd"],
}

# NOTE: unpickling can execute arbitrary code — only load a trusted, locally
# produced file here.
df = pd.read_pickle('../data/topic.pkl')

# Add one boolean 'contains_<party>' column per party, marking rows whose
# preprocessed text contains any of that party's search terms. The columns are
# derived from `search_terms` so the flags cannot drift from the terms that are
# used later when extracting the mention snippets.
for party, terms in search_terms.items():
    contains_any = pd.Series(False, index=df.index)
    for term in terms:
        # regex=False: terms are literal substrings.
        # na=False: missing text counts as "no mention", keeping the column
        # free of NaN so it stays usable as a boolean mask.
        contains_any |= df['preprocessed'].str.contains(term, regex=False, na=False)
    df['contains_' + party] = contains_any

def extract_party_mentions(input_string, party_strings, n_words=10):
    """Return a context snippet for every word that mentions a party.

    The text is split on whitespace. A word counts as a mention when it
    contains at least one of ``party_strings`` as a substring (so 'grüne'
    also matches 'grünen').

    Args:
        input_string: Text to search.
        party_strings: Iterable of substrings identifying the party.
        n_words: Number of words of context kept on each side of the mention.

    Returns:
        A list of strings, one per matching word in order of appearance, each
        consisting of the matching word with up to ``n_words`` words before and
        after it, joined by single spaces. Empty list if nothing matches.
    """
    words = input_string.split()
    snippets = []
    for position, word in enumerate(words):
        if any(term in word for term in party_strings):
            # Clamp at 0: a negative start would be read by Python slicing as
            # an offset from the END of the list, yielding an empty or wrong
            # snippet for mentions within the first `n_words` words.
            start = max(0, position - n_words)
            snippets.append(" ".join(words[start:position + n_words + 1]))
    return snippets

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Tokenizer and model are loaded from the same checkpoint id.
model_name = "mdraw/german-news-sentiment-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap both in a sentiment-analysis pipeline that is reused by the cells below.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Smoke test on an obviously negative sentence; the result is the cell output.
classifier("Dies ist ein schlechter Test.")

[{'label': 'negative', 'score': 0.988132655620575}]

In [3]:
# Spot check: extract the mention snippets from a single transcript of one
# party and print them together with the classifier output.
ind = 4
party = 'grüne'

subset = f'contains_{party}'
# ind-th transcript (by position) among the rows flagged for this party
teststring = df.loc[df[subset], 'transcript'].iloc[ind]
extracted_strings = extract_party_mentions(
    input_string=teststring,
    party_strings=search_terms[party],
)
print(f'Strings: {extracted_strings}')
print(f'Classification: {classifier(extracted_strings)}')

Strings: ['mal einen großen streit tag bei future nun sind die grünen in der bundesregierung wenn es nach der ampel geht soll', 'wie zu erwarten war aus viele jugendliche sehen in den grünen jedoch ein fortschritt wie geht']
Classification: [{'label': 'negative', 'score': 0.670270562171936}, {'label': 'neutral', 'score': 0.46254244446754456}]


In [4]:
def extract_mention_df(party):
    """Build a frame with one row per mention snippet of ``party``.

    Reads the global ``df``: only rows flagged by its 'contains_<party>' column
    are used. For each of those transcripts the snippets are extracted (in
    parallel via ``parallel_apply``) with a context of 10 words per side.

    Args:
        party: Key into ``search_terms``; also the suffix of the
            'contains_<party>' column.

    Returns:
        DataFrame holding the former index of ``df`` as a column (added by
        ``reset_index``), 'medium', and 'transcript' (one snippet per row).
        Rows with missing values and rows whose snippet is the empty string
        are removed.
    """
    mentioned = df['contains_' + party]
    snippets_per_row = df.loc[mentioned, 'transcript'].parallel_apply(
        lambda transcript: extract_party_mentions(
            input_string=transcript,
            party_strings=search_terms[party],
            n_words=10,
        )
    )
    per_snippet = (
        pd.DataFrame({'medium': df.loc[mentioned, 'medium'], 'transcript': snippets_per_row})
        .explode(column='transcript')  # one row per snippet; empty lists become NaN
        .reset_index()
        .dropna()
    )
    empty_rows = per_snippet.index[per_snippet['transcript'] == '']
    return per_snippet.drop(index=empty_rows)

In [5]:
def extract_sentiment_df(input_df):
    """Classify every snippet and add the result columns to ``input_df``.

    The global ``classifier`` is applied to each value of the 'transcript'
    column (with a tqdm progress bar). Only the first entry of each classifier
    result is evaluated.

    Args:
        input_df: DataFrame with a 'transcript' column. It is modified in place.

    Returns:
        The same ``input_df`` with the added columns 'sentiment' (raw
        classifier result), the boolean flags 'positive', 'neutral' and
        'negative' (exactly the predicted label is True), and 'score' (the
        score reported for the predicted label).
    """
    input_df['sentiment'] = input_df['transcript'].progress_apply(classifier)

    top_predictions = [result[0] for result in input_df['sentiment']]
    predicted_labels = [prediction['label'] for prediction in top_predictions]

    for label in ('positive', 'neutral', 'negative'):
        input_df[label] = [predicted == label for predicted in predicted_labels]

    input_df['score'] = [prediction['score'] for prediction in top_predictions]
    return input_df

In [6]:
def extract_avg_sentiment_df(input_df):
    """Average the classifier score per medium, separately for each label.

    Args:
        input_df: DataFrame with the columns 'medium', 'score' and the boolean
            flags 'positive', 'neutral' and 'negative'.

    Returns:
        DataFrame indexed by medium with the columns 'avg_pos', 'avg_neu' and
        'avg_neg': the mean 'score' of the rows carrying the respective flag.
        A medium without rows for a label gets NaN in that column.
    """
    flag_by_column = {
        'avg_pos': 'positive',
        'avg_neu': 'neutral',
        'avg_neg': 'negative',
    }
    averages = {}
    for column, flag in flag_by_column.items():
        flagged_rows = input_df[input_df[flag]]
        averages[column] = flagged_rows.groupby(['medium'])['score'].mean()
    return pd.DataFrame(data=averages)

In [7]:
def get_avg_party_sentiment(party):
    """Run the full pipeline for one party and return a per-medium score.

    Chains ``extract_mention_df`` -> ``extract_sentiment_df`` ->
    ``extract_avg_sentiment_df``.

    Args:
        party: Party key, passed on to ``extract_mention_df``.

    Returns:
        Series indexed by medium holding 'avg_pos' minus 'avg_neg'. The value
        is NaN for a medium where either of the two averages is missing.
    """
    mentions = extract_mention_df(party)
    classified = extract_sentiment_df(mentions)
    per_medium = extract_avg_sentiment_df(classified)
    return per_medium['avg_pos'] - per_medium['avg_neg']

In [8]:
# Per-medium sentiment score for every party in `search_terms`.
# Filled one party at a time: each iteration classifies tens of thousands of
# snippets and took roughly 12-37 minutes in the recorded run, so results of
# finished parties stay available if the loop is interrupted.
sentiment_dict = {}
for party in search_terms:
    sentiment_dict[party] = get_avg_party_sentiment(party)

100%|██████████| 33238/33238 [35:37<00:00, 15.55it/s] 
100%|██████████| 11875/11875 [12:47<00:00, 15.48it/s]
100%|██████████| 21404/21404 [20:38<00:00, 17.28it/s]
100%|██████████| 37298/37298 [36:34<00:00, 17.00it/s]
100%|██████████| 17777/17777 [16:52<00:00, 17.55it/s] 
100%|██████████| 25391/25391 [25:29<00:00, 16.60it/s]
100%|██████████| 33166/33166 [36:24<00:00, 15.18it/s]  
