In [2]:
import pandas as pd
import re
from tqdm import tqdm
from dateutil.parser import parse

In [3]:
bt = pd.read_pickle('../assets/bundestag.pkl')
bt.drop(columns=['Geburts-jahr', 'Land', 'Listen-platz', 'Erst-stimmen-anteil', 'Listen-platz', 'Wahlkreis', 'BeruflicherHintergrund', 'MdBseit', 'Bemerkungen'], inplace=True)
party_blacklist = [
    'fraktionslos(SSW)',
    'fraktionslos (Zentrum)',
    'fraktionslos (AfD)',
    'fraktionslos (ehemals AfD)',
]
bt = bt[bt['Fraktion(Partei)'].isin(party_blacklist) == False]
bt.loc[bt['Fraktion(Partei)'] == 'CDU/CSU (CDU)', 'Fraktion(Partei)'] = 'cdu'
bt.loc[bt['Fraktion(Partei)'] == 'CDU/CSU (CSU)', 'Fraktion(Partei)'] = 'csu'
politician_dict = bt.set_index('Name').to_dict()['Fraktion(Partei)']
politicians = {
    politician.lower():politician_dict[politician].lower() for politician in list(politician_dict.keys())
}

In [4]:
search_terms = {'die linke':'linke',
                'linkspartei':'linke',
                'die grünen':'grüne',
                'spd':'spd',
                'freien demokraten':'fdp',
                'fdp':'fdp',
                'cdu':'cdu',
                'csu':'csu',
                'alternative für deutschland':'afd',
                'afd':'afd', 
                'afg':'afd',
                'trump':'trump',
                }

In [6]:
# read data, merge zdf, drop irrelevant media
media_to_consider = ['NachDenkSeiten', 'taz', 'DER SPIEGEL', 'ARD', 'ZDF', 'Bayerischer Rundfunk', 'ntv Nachrichten', 'faz', 'WELT', 'BILD', 'COMPACTTV']
df = pd.read_pickle('../data/topics_combined.pkl')
df.loc[df['medium'] == 'ZDFinfo Dokus & Reportagen', 'medium'] = 'ZDF'
df.loc[df['medium'] == 'ZDFheute Nachrichten', 'medium'] = 'ZDF'
df = df[df['medium'].isin(media_to_consider)]

In [7]:
mention_df = pd.DataFrame(columns=['medium', 'id', 'title', 'minute', 'transcript', 'date', 'search_term', 'extracted_string'])

In [8]:
def extract_string(transcript, search_term):
  # Use a regular expression to find all occurrences of the search term in the transcript
  pattern = r"(?i)\b" + re.escape(search_term) + r"\b"
  matches = re.finditer(pattern, transcript)

  # For each occurrence, extract a 21 word long string with the search term in the middle
  extracted_strings = []
  for match in matches:
    start_index = match.start()
    end_index = match.end()

    # Split the transcript into words
    words_before = transcript[:start_index].split()
    words_after = transcript[end_index:].split()
    if len(words_before) < 10:
      return
    if len(words_after) < 10:
      return


    # Extract the 21 word long string
    string = ' '.join(words_before[-10:] + [search_term] + words_after[:10])
    extracted_strings.append(string)

  return extracted_strings


# Create an empty list to store the rows of the new DataFrame
rows = []

# Iterate over the rows of the original DataFrame
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
  medium = row['medium']
  id = row['id']
  title = row['title']
  minute = row['minute']
  transcript = row['transcript']
  date = row['date']


  # For each search term, extract the relevant strings and add a row to the new DataFrame for each occurrence
  for term in search_terms.keys():
    extracted_strings = extract_string(transcript, term)
    if extracted_strings:
      for extracted_string in extracted_strings:
        rows.append({'medium': medium, 'id': id, 'title': title, 'minute': minute, 'date': date, 'search_term': term, 'extracted_string': extracted_string})

# Create the new DataFrame from the list of rows
party_df = pd.DataFrame(rows, columns=['medium', 'id', 'title', 'minute', 'date', 'search_term', 'extracted_string'])

100%|██████████| 868694/868694 [02:05<00:00, 6931.48it/s]


In [9]:
party_df['party'] = party_df['search_term'].apply(lambda x: search_terms[x])

In [10]:
party_df['date'] = party_df.date.apply(parse)

In [13]:
party_df.search_term.value_counts()

spd                            28808
cdu                            28157
fdp                            19041
trump                          13655
csu                            10641
die grünen                     10225
afg                             7003
afd                             3920
die linke                       3072
linkspartei                     1817
freien demokraten                335
alternative für deutschland      135
Name: search_term, dtype: int64

In [14]:
party_df.to_pickle('../data/mentions/party_mentions.pkl')

In [15]:
# Create an empty list to store the rows of the new DataFrame
rows = []

# Iterate over the rows of the original DataFrame
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
  medium = row['medium']
  id = row['id']
  title = row['title']
  minute = row['minute']
  transcript = row['transcript']
  date = row['date']


  # For each search term, extract the relevant strings and add a row to the new DataFrame for each occurrence
  for politician in politicians.keys():
    extracted_strings = extract_string(transcript, politician)
    if extracted_strings:
      for extracted_string in extracted_strings:
        rows.append({'medium': medium, 'id': id, 'title': title, 'minute': minute, 'date': date, 'search_term': politician, 'extracted_string': extracted_string})

# Create the new DataFrame from the list of rows
politician_df = pd.DataFrame(rows, columns=['medium', 'id', 'title', 'minute', 'date', 'search_term', 'extracted_string'])

100%|██████████| 868694/868694 [6:10:37<00:00, 39.06it/s]   


In [16]:
politician_df['party'] = politician_df['search_term'].apply(lambda x: politicians[x])

In [19]:
politician_df['date'] = politician_df.date.apply(parse)

In [20]:
politician_df

Unnamed: 0,medium,id,title,minute,date,search_term,extracted_string,party
0,ARD,Monitor,studioM: Grüne Kompromisse - Hauptsache Macht?,1,2021-02-12,lisa badum,diskutieren wir heute natürlich auch mit einer...,grüne
1,ARD,Monitor,studioM: Grüne Kompromisse - Hauptsache Macht?,5,2021-02-12,annalena baerbock,"ist. Und es steht außer Frage, dass die Grünen...",grüne
2,ARD,Monitor,studioM: Grüne Kompromisse - Hauptsache Macht?,14,2021-02-12,robert habeck,"sein. Auf der Pressekonferenz, wo der Koalitio...",grüne
3,ARD,Monitor,studioM: Grüne Kompromisse - Hauptsache Macht?,25,2021-02-12,annalena baerbock,der Außengrenze an der EU-Außengrenze zu rügen...,grüne
4,ARD,Monitor,studioM: Grüne Kompromisse - Hauptsache Macht?,31,2021-02-12,paul ziemiak,"so, dass in einigen Bereichen da durchaus Verb...",cdu
...,...,...,...,...,...,...,...,...
56845,BILD,bXREDY4a63w,HSV: Timo Kraus bleibt weiterhin verschwunden ...,3,2017-01-27,michael kruse,auschwitz birkenau haben heute holocaust-überl...,fdp
56846,BILD,jhrEur2n-_E,Pressekonferenz zum Rücktritt Gabriels - BILD-...,3,2017-01-24,olaf scholz,das präsidium einstimmig meinem vorschlag gefo...,spd
56847,BILD,F1cNhUoIF38,Donald Trump hat ein falsches Bild von der EU ...,5,2017-01-16,jens spahn,wirklich über twitter kommuniziert wieder zu m...,cdu
56848,BILD,sYWsBZvjtUc,Flüchtlingen in Belgrad droht der Kältetod - B...,15,2017-01-11,olaf scholz,anderem unser bundespräsident herr gauck ist d...,spd


In [21]:
politician_df.search_term.value_counts()

olaf scholz          13114
armin laschet         7175
friedrich merz        4250
annalena baerbock     4040
karl lauterbach       3197
                     ...  
ralph edelhäußer         1
leon eckert              1
jan dieren               1
anna christmann          1
heike brehmer            1
Name: search_term, Length: 477, dtype: int64

In [None]:
politician_df.to_pickle('../data/mentions/politician_mentions.pkl')

In [17]:
politician_df.groupby(['search_term', 'medium', 'id', 'title','date', 'party']).size().reset_index(name='mention_count').to_pickle('../data/mentions/politician_mentions_unique.pkl')

In [23]:
'''mention_count = politician_df.groupby(['search_term', 'medium', 'id', 'title']).size()
mention_count = mention_count.reset_index(name='mention_count')
unique_politician_df = mention_count.drop(columns=['id', 'title']).groupby(['medium', 'search_term']).count().sort_values('mention_count', ascending=False).reset_index()
unique_politician_df['party'] = unique_politician_df['search_term'].apply(lambda x: politicians[x])'''

In [24]:
unique_politician_df

Unnamed: 0,medium,search_term,mention_count,party
0,WELT,olaf scholz,1057,spd
1,BILD,olaf scholz,766,spd
2,ZDF,olaf scholz,708,spd
3,WELT,armin laschet,634,cdu
4,BILD,armin laschet,485,cdu
...,...,...,...,...
1651,ZDF,falko droßmann,1,spd
1652,ZDF,florian müller,1,cdu
1653,ZDF,frank bsirske,1,grüne
1654,DER SPIEGEL,carsten linnemann,1,cdu


In [25]:
unique_politician_df.to_pickle('../data/mentions/politician_mentions_unique.pkl')

In [3]:
party_df = pd.read_pickle('../data/mentions/party_mentions.pkl')

In [None]:
party_df.groupby(['search_term', 'medium', 'id', 'title','date', 'party']).size().reset_index(name='mention_count').to_pickle('../data/mentions/party_mentions_unique.pkl')