In [328]:
import pysrt
import re
import os
import glob
import pandas as pd
from tqdm import tqdm
import spacy
from pandarallel import pandarallel

# Regular expressions used to clean subtitle text and to pull metadata fields
# out of the per-video .txt info files.
#
# All patterns are compiled with re.DOTALL, so "." also matches newlines. The
# "(.*)" groups are greedy: each one extends to the LAST position in the text
# at which its trailing lookahead still matches.

# Opening <font color="#RRGGBB"> tags (uppercase hex only) and closing </font> tags.
clean_pattern = re.compile(r"<font color=\"#[0-9A-F]{6}\">|</font>", re.DOTALL)

# "Datum:" line; captures a dd.mm.20yy date. The separators are escaped so that
# only a literal "." is accepted (an unescaped "." would match any character).
date_pattern = re.compile(r"Datum:\s+([0-3][0-9]\.[01][0-9]\.20[012][0-9])(?=\n)", re.DOTALL)

# Text that follows a line ending in ".m3u8" or ".mp4" plus a blank line.
# The extension dots are escaped so they only match a literal ".".
description_pattern = re.compile(r"(?:\.m3u8\n\n|\.mp4\n\n)(.*)(?=\n\n|\n)", re.DOTALL)

# Value of the "Titel:" field, terminated by a blank line followed by "Datum".
title_pattern = re.compile(r"Titel:\s+(.*)(?=\n\nDatum)", re.DOTALL)

# Value of the "Dauer:" field, terminated by one of the three listed followers.
duration_pattern = re.compile(r"Dauer:\s+(.*)(?=\n\nTitel|\n\n\nWebsite|\nGröße)", re.DOTALL)

# Value of the "Sender:" field. Only the two broadcasters ARD and ZDF are
# accepted; a character-class form like [AZ][RD][DF] would also accept
# combinations such as "ARF" or "ZRD".
channel_pattern = re.compile(r"Sender:\s+((?:ARD|ZDF)(?=\n))", re.DOTALL)

In [329]:
def define_print(verbose=True):
    """
    selects the function used for progress messages
    :param verbose: if truthy, messages are printed; otherwise they are discarded
    :return: the builtin print, or a no-op that accepts any positional arguments
    """

    def _discard(*args):
        # swallow the message and return None, mirroring a silent print
        return None

    return print if verbose else _discard

def load_filter():
    """
    builds the combined list of words to drop during preprocessing
    :return: list of unique filter words (spaCy's German stop words plus the
             whitespace-separated tokens of the two project word lists);
             the order of the list is arbitrary
    :raises OSError: if one of the two word-list files cannot be opened
    """
    # Import the stop-word set explicitly instead of loading a full spaCy model
    # just so that `spacy.lang.de` happens to be importable as an attribute.
    from spacy.lang.de.stop_words import STOP_WORDS

    # Work on a copy: updating STOP_WORDS itself would silently change spaCy's
    # shared stop-word set for everything else running in this process.
    filterwords = set(STOP_WORDS)
    with open("../docs/filterwords.txt", encoding="utf-8", errors="ignore") as d:
        filterwords.update(d.read().split())
    with open("../docs/german_stopwords_full.txt", encoding="utf-8", errors="ignore") as d:
        # NOTE(review): the first 53 tokens are skipped - presumably a header or
        # licence preamble in the file; confirm against the file contents.
        filterwords.update(d.read().split()[53:])
    return list(filterwords)

def lemmatize(text, nlp, filterwords):
    """
    lemmatizes german input text and removes unwanted tokens
    :param text: raw input text (german)
    :param nlp: loaded spaCy pipeline; it is called inside
                nlp.select_pipes(enable="lemmatizer")
    :param filterwords: collection of lowercase words to drop (checked with `in`)
    :return: single string of the kept, lowercased lemmas joined by spaces
    """

    with nlp.select_pipes(enable="lemmatizer"):
        parsed = nlp(text)

    kept = []
    for token in parsed:
        lemma = token.lemma_.lower()
        # drop anything that is not purely alphabetic (numbers, punctuation, hyphenated forms)
        if not lemma.isalpha():
            continue
        if lemma in filterwords:
            continue
        kept.append(lemma)
    return " ".join(kept)

def preprocess(df, to_csv=False, to_pickle=False, verbose=True):
    """
    lemmatizes the transcript column of a dataframe in parallel
    :param df: dataframe with a "transcript" column (and a "medium" column if
               one of the export flags is set); it is modified in place
    :param to_csv: if True, write the result to data/preprocessed/<medium>_preprocessed.csv
    :param to_pickle: if True, write the result to data/preprocessed/<medium>_preprocessed.pkl
    :param verbose: if True, print a progress message
    :return: the same dataframe object with an added "preprocessed" column
    :raises IndexError: if an export flag is set and df has no rows
    """
    verboseprint = define_print(verbose=verbose)
    pandarallel.initialize(progress_bar=True)
    # load_filter() returns a list; convert it once to a frozenset so that the
    # per-token membership test in lemmatize is a hash lookup, not a list scan.
    filterwords = frozenset(load_filter())
    nlp = spacy.load("de_core_news_sm")

    verboseprint(f"lemmatizing transcript data of {len(df.index)} videos...")
    df["preprocessed"] = df["transcript"].parallel_apply(
        lemmatize, args=(nlp, filterwords)
    )

    if to_csv or to_pickle:
        # The output file is named after the medium of the first row.
        # Paths are relative to the current working directory.
        medium = df.iloc[0]["medium"]

        if to_csv:
            df.to_csv("data/preprocessed/" + medium + "_preprocessed.csv")

        if to_pickle:
            df.to_pickle("data/preprocessed/" + medium + "_preprocessed.pkl")
    return df

In [330]:
# Build one DataFrame row per (video, minute) from the subtitle folders.
# Each video is expected as a pair "<name>.txt" (metadata) / "<name>.srt" (subtitles).

failcounter = 0  # number of videos skipped because they could not be read or parsed
mediathek_dict = {
    'medium':[],
    'id':[],
    'title':[],
    'description':[],
    'duration':[],
    'date':[],
    'category':[],
    'minute':[],
    'transcript':[],
}

folder_list = os.listdir('../assets/mediathek_subtitles/')
if '.DS_Store' in folder_list:
    folder_list.remove('.DS_Store')

for folder in tqdm(folder_list):
    for txtfile in glob.glob("../assets/mediathek_subtitles/"+folder+"/*.txt"):
        try:
            with open(txtfile) as f:
                txt = f.read()
            subs = pysrt.open(txtfile.replace('.txt', '.srt'))

            # The metadata is identical for every minute of a video, so it is
            # extracted once per file. A missing field raises IndexError here
            # and the video is counted as failed instead of aborting the run.
            medium = re.findall(channel_pattern, txt)[0]
            title = re.findall(title_pattern, txt)[0]
            description = re.findall(description_pattern, txt)[0].replace("\n", " ")
            duration = re.findall(duration_pattern, txt)[0]
            date = re.findall(date_pattern, txt)[0]
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts.
            failcounter += 1
            continue

        if len(subs) == 0:
            # max() below would raise on an empty subtitle file
            failcounter += 1
            continue

        # Minute index of every subtitle, counted from the start of the video.
        # `start.minutes` alone is only the 0-59 component of the timestamp and
        # wraps around for videos of one hour or longer.
        start_minutes = [sub.start.hours * 60 + sub.start.minutes for sub in subs]
        max_minute = max(start_minutes)

        # NOTE(review): iteration starts at index 1, so the first subtitle entry
        # is never added to a transcript - confirm this is intentional.
        idx = 1

        # NOTE(review): range(max_minute) stops before the final minute, so the
        # subtitles of the last (possibly partial) minute are dropped - confirm.
        for minute in range(max_minute):
            parts = []
            while idx < len(subs) and start_minutes[idx] == minute:
                parts.append(re.sub(clean_pattern, "", subs[idx].text).replace("\n"," ") + " ")
                idx += 1
            mediathek_dict['medium'].append(medium)
            mediathek_dict['id'].append(None)
            mediathek_dict['title'].append(title)
            mediathek_dict['description'].append(description)
            mediathek_dict['duration'].append(duration)
            mediathek_dict['date'].append(date)
            mediathek_dict['category'].append("News & Politics")
            mediathek_dict['minute'].append(minute)
            mediathek_dict['transcript'].append("".join(parts))

df = pd.DataFrame(mediathek_dict)
print(f"successfully created dataframe with {len(df.index)} minutes of transcript data.")
if failcounter:
    print(f"skipped {failcounter} videos that could not be read or parsed.")

100%|██████████| 9/9 [00:16<00:00,  1.83s/it]

successfully created dataframe with 24284 minutes of transcript data.





In [331]:
# Load the combined filter-word list into a notebook-level variable.
# NOTE(review): preprocess() calls load_filter() itself and does not receive
# this variable, so the assignment is not needed for the call below - confirm
# whether another cell uses `filterwords` before removing it.
filterwords = load_filter()
# Lemmatize the transcripts; preprocess() adds a "preprocessed" column to df
# and returns the same dataframe.
df = preprocess(df)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
lemmatizing transcript data of 24284 videos...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3036), Label(value='0 / 3036'))), …

In [332]:
# Persist the dataframe as a pickle file under ../data/.
df.to_pickle('../data/mediathek_data.pkl')

In [333]:
# Read the pickle back and display the first rows as a round-trip check.
# NOTE(review): `aksdhja` looks like a throwaway placeholder name - consider a
# descriptive one once it is confirmed that no other cell refers to it.
aksdhja = pd.read_pickle('../data/mediathek_data.pkl')
aksdhja.head()

Unnamed: 0,medium,id,title,description,duration,date,category,minute,transcript,preprocessed
0,ARD,,studioM - Corona-Demos: Alles Verschwörung?,Monitor-Chef Georg Restle diskutiert in studio...,00:57:08,27.05.2020,News & Politics,0,Hallo und willkommen zur 7. Ausgabe von studio...,willkommen ausgabe befassen protesten fragen d...
1,ARD,,studioM - Corona-Demos: Alles Verschwörung?,Monitor-Chef Georg Restle diskutiert in studio...,00:57:08,27.05.2020,News & Politics,1,oder Verschwörungserzählungen befasst hat. Hal...,verschwörungserzählungen befasst nocun grüße f...
2,ARD,,studioM - Corona-Demos: Alles Verschwörung?,Monitor-Chef Georg Restle diskutiert in studio...,00:57:08,27.05.2020,News & Politics,2,"D.h. es muss möglich sein zu beweisen, dass si...",beweisen falsch wissenschaftler gezeigt theori...
3,ARD,,studioM - Corona-Demos: Alles Verschwörung?,Monitor-Chef Georg Restle diskutiert in studio...,00:57:08,27.05.2020,News & Politics,3,"Wir sehen, dass online sehr viele Gruppen derz...",gruppen proteste mobilisieren erzählungen verb...
4,ARD,,studioM - Corona-Demos: Alles Verschwörung?,Monitor-Chef Georg Restle diskutiert in studio...,00:57:08,27.05.2020,News & Politics,4,"die auf diesen Demos sind, Verschwörungsideolo...",demos verschwörungsideologen demonstrationen a...
