In [1]:
import pandas as pd
import re

In [2]:
search_terms = {'die linke':'linke',
                'linkspartei':'linke',
                'die grünen':'grüne',
                'spd':'spd',
                'freien demokraten':'fdp',
                'fdp':'fdp',
                'cdu':'cdu',
                'csu':'csu',
                'alternative für deutschland':'afd',
                'afd':'afd', 
                'afg':'afd'}

In [3]:
# read data, merge zdf, drop irrelevant media
media_to_consider = ['NachDenkSeiten', 'taz', 'DER SPIEGEL', 'ARD', 'ZDF', 'Bayerischer Rundfunk', 'ntv Nachrichten', 'faz', 'WELT', 'BILD', 'COMPACTTV']
df = pd.read_pickle('../data/combined.pkl')
df.loc[df['medium'] == 'ZDFinfo Dokus & Reportagen', 'medium'] = 'ZDF'
df.loc[df['medium'] == 'ZDFheute Nachrichten', 'medium'] = 'ZDF'
df = df[df['medium'].isin(media_to_consider)]

In [4]:
mention_df = pd.DataFrame(columns=['medium', 'id', 'title', 'minute', 'transcript', 'search_term', 'extracted_string'])

In [5]:
def extract_string(transcript, search_term):
  # Use a regular expression to find all occurrences of the search term in the transcript
  pattern = r"(?i)\b" + re.escape(search_term) + r"\b"
  matches = re.finditer(pattern, transcript)

  # For each occurrence, extract a 21 word long string with the search term in the middle
  extracted_strings = []
  for match in matches:
    start_index = match.start()
    end_index = match.end()

    # Split the transcript into words
    words_before = transcript[:start_index].split()
    words_after = transcript[end_index:].split()
    if len(words_before) < 10:
      return
    if len(words_after) < 10:
      return


    # Extract the 21 word long string
    string = ' '.join(words_before[-10:] + [search_term] + words_after[:10])
    extracted_strings.append(string)

  return extracted_strings


# Create an empty list to store the rows of the new DataFrame
rows = []

# Iterate over the rows of the original DataFrame
for _, row in df.iterrows():
  medium = row['medium']
  id = row['id']
  title = row['title']
  minute = row['minute']
  transcript = row['transcript']


  # For each search term, extract the relevant strings and add a row to the new DataFrame for each occurrence
  for term in search_terms.keys():
    extracted_strings = extract_string(transcript, term)
    if extracted_strings:
      for extracted_string in extracted_strings:
        rows.append({'medium': medium, 'id': id, 'title': title, 'minute': minute, 'search_term': term, 'extracted_string': extracted_string})

# Create the new DataFrame from the list of rows
new_df = pd.DataFrame(rows, columns=['medium', 'id', 'search_term', 'extracted_string'])

In [6]:
new_df.search_term.value_counts()

spd                            28808
cdu                            28157
fdp                            19041
csu                            10641
die grünen                     10225
afg                             7003
afd                             3920
die linke                       3072
linkspartei                     1817
freien demokraten                335
alternative für deutschland      135
Name: search_term, dtype: int64

In [14]:
test = new_df.loc[new_df['search_term'] == 'die grünen', 'extracted_string'].sample(50)