In [21]:
import pysrt
import re
import os
import glob
import pandas as pd
from tqdm import tqdm
import spacy
from pandarallel import pandarallel
from datetime import datetime

# Regular expressions used to clean subtitle text and to pull metadata fields
# out of the per-episode .txt info files. All are compiled with re.DOTALL, so
# "." also matches newlines.

# Matches opening <font color="#RRGGBB"> tags and closing </font> tags so they
# can be stripped from subtitle text.
clean_pattern = re.compile(r"<font color=\"#[0123456789ABCDEFabcdef]{6}\">|</font>", re.DOTALL)
# Captures the value after "Datum:" (dd?mm?20yy) up to the end of the line.
# NOTE(review): the separators are unescaped "." and therefore match any
# character, not only a literal dot.
date_pattern = re.compile(r"Datum:\s+([0123][0123456789].[01][0123456789].20[012][0123456789])(?=\n)", re.DOTALL)
# Captures the text that follows a "?m3u8" / "?mp4" token and a blank line.
# NOTE(review): "(.*)" is greedy and DOTALL is set, so the capture can span
# several lines (callers replace "\n" with spaces) — confirm this is intended.
# The leading "." before m3u8/mp4 is also an unescaped wildcard.
description_pattern = re.compile(r"(?:.m3u8\n\n|.mp4\n\n)(.*)(?=\n\n|\n)", re.DOTALL)
# Captures the value after "Titel:" up to the blank line preceding "Datum".
title_pattern = re.compile(r"Titel:\s+(.*)(?=\n\nDatum)", re.DOTALL)
# Captures the value after "Thema:" up to the blank line preceding "Titel".
show_pattern = re.compile(r"Thema:\s+(.*)(?=\n\nTitel)", re.DOTALL)
# Captures the value after "Dauer:" up to one of three possible following
# sections ("Titel", "Website" or "Größe").
duration_pattern = re.compile(r"Dauer:\s+(.*)(?=\n\nTitel|\n\n\nWebsite|\nGröße)", re.DOTALL)
# Captures a three-letter code after "Sender:" built from [AZ][RD][DF]
# (e.g. "ARD", "ZDF"; other combinations of those letters match as well).
channel_pattern = re.compile(r"Sender:\s+([AZ][RD][DF](?=\n))", re.DOTALL)

In [22]:
def define_print(verbose=True):
    """
    Return a callable that can be used in place of ``print``.

    :param verbose: when truthy the built-in ``print`` is returned, otherwise
        a function that silently discards its arguments
    :return: callable accepting the same positional and keyword arguments as
        ``print``
    """
    if verbose:
        return print

    def verboseprint(*args, **kwargs):
        # Accept and ignore keyword arguments (end=, sep=, file=, ...) as well,
        # so call sites behave the same regardless of verbosity.
        return None

    return verboseprint

def load_filter():
    """
    Build the list of words that are removed from lemmatized transcripts.

    Combines spaCy's German stop words with the whitespace-separated words in
    ../assets/filterwords.txt and ../assets/german_stopwords_full.txt.

    :return: list of unique filter words (order is not defined)
    """
    # Imported explicitly so that the German language data is available
    # without having to load a full pipeline first.
    from spacy.lang.de.stop_words import STOP_WORDS

    # Work on a copy: updating STOP_WORDS directly would permanently alter
    # spaCy's module-level set for the rest of the kernel session.
    filterwords = set(STOP_WORDS)
    with open("../assets/filterwords.txt", encoding="utf-8", errors="ignore") as d:
        filterwords.update(d.read().split())
    with open("../assets/german_stopwords_full.txt", encoding="utf-8", errors="ignore") as d:
        # The first 53 whitespace-separated tokens are skipped.
        # NOTE(review): presumably a header/licence preamble — verify against the file.
        filterwords.update(d.read().split()[53:])
    return list(filterwords)

def lemmatize(text, nlp, filterwords):
    """
    tokenizes and lemmatizes german input text
    :param text: raw input text (german)
    :param nlp: callable pipeline that offers ``select_pipes`` and yields tokens
        with a ``lemma_`` attribute (``preprocess`` passes a loaded spaCy model)
    :param filterwords: collection of lower-case words to drop from the result
    :return: single string with the remaining lower-cased lemmas joined by spaces
    """
    # A list is searched linearly for every token; convert once per call so
    # each membership test is a hash lookup.
    if not isinstance(filterwords, (set, frozenset)):
        filterwords = frozenset(filterwords)

    with nlp.select_pipes(enable="lemmatizer"):
        doc = nlp(text)
    lemmas = (token.lemma_.lower() for token in doc)
    # Keep purely alphabetic lemmas that are not filter words.
    return " ".join(
        lemma for lemma in lemmas if lemma.isalpha() and lemma not in filterwords
    )

def preprocess(df, to_csv=False, to_pickle=False, verbose=True):
    """
    Add a lemmatized ``preprocessed`` column to ``df`` and optionally save it.

    :param df: DataFrame with a ``transcript`` column; ``medium`` of the first
        row is read to build the output file name when saving
    :param to_csv: also write the frame as CSV below data/preprocessed/
    :param to_pickle: also write the frame as pickle below data/preprocessed/
    :param verbose: print a progress message when True
    :return: the DataFrame that was passed in, with the new column added
    """
    log = define_print(verbose=verbose)
    pandarallel.initialize(progress_bar=True)
    stopwords = load_filter()
    model = spacy.load("de_core_news_sm")

    log(f"lemmatizing transcript data of {len(df.index)} videos...")
    transcripts = df["transcript"]
    df["preprocessed"] = transcripts.parallel_apply(lemmatize, args=(model, stopwords))

    # The file name is derived from the first row's medium, only when a save
    # was requested.
    if to_csv:
        csv_path = "data/preprocessed/" + df.iloc[0]["medium"] + "_preprocessed.csv"
        df.to_csv(csv_path)

    if to_pickle:
        pickle_path = "data/preprocessed/" + df.iloc[0]["medium"] + "_preprocessed.pkl"
        df.to_pickle(pickle_path)

    return df

def convert_string_to_seconds(string):
    """
    Convert a duration written as HH:MM:SS into a number of seconds.

    :param string: duration in the format '%H:%M:%S'
    :return: total seconds as int
    """
    parsed = datetime.strptime(string, '%H:%M:%S')
    weighted_parts = (
        (parsed.hour, 3600),
        (parsed.minute, 60),
        (parsed.second, 1),
    )
    return sum(value * factor for value, factor in weighted_parts)

In [23]:
# Build one row per (episode, minute): every *.txt info file below
# ../assets/mediathek_subtitles/<folder>/ is paired with the .srt file of the
# same name; the subtitle text is bucketed by start minute and the episode
# metadata parsed from the .txt file is repeated on every row.
#
# NOTE(review): the points below look like defects but are left unchanged on
# purpose, because later cells align this frame by position/row count with a
# previously saved artifact — fix them together with regenerating that file:
#   * `start.minutes` appears to be the 0-59 minutes component in pysrt, not
#     the total minutes — confirm; if so, episodes longer than one hour are
#     cut off after the first hour.
#   * `idx = 1` skips the first subtitle entry.
#   * `range(max_minute)` never emits the last minute.
#   * os.listdir()/glob order is not sorted, so row order may differ between runs.
failcounter = 0  # episodes skipped because they could not be read or parsed
mediathek_dict = {
    'medium':[],
    'id':[],
    'title':[],
    'description':[],
    'duration':[],
    'date':[],
    'category':[],
    'minute':[],
    'transcript':[],
}

folder_list = os.listdir('../assets/mediathek_subtitles/')
if '.DS_Store' in folder_list:
    folder_list.remove('.DS_Store')

for folder in tqdm(folder_list):
    for txtfile in glob.glob("../assets/mediathek_subtitles/"+folder+"/*.txt"):
        try:
            # Explicit encoding, consistent with the other text files read in
            # this notebook, instead of the platform default.
            with open(txtfile, encoding="utf-8") as f:
                txt = f.read()
            subs = pysrt.open(txtfile.replace('.txt', '.srt'))
        except Exception:
            # Unreadable/missing files are counted and skipped; unlike a bare
            # `except:` this does not swallow KeyboardInterrupt.
            failcounter += 1
            continue

        # An empty subtitle file would make max() below raise.
        if len(subs) == 0:
            failcounter += 1
            continue

        # The metadata is the same for every minute of an episode, so parse it
        # once per file rather than once per row. Keys match mediathek_dict.
        try:
            metadata = {
                'medium': re.findall(channel_pattern, txt)[0],
                'id': re.findall(show_pattern, txt)[0],
                'title': re.findall(title_pattern, txt)[0],
                'description': re.findall(description_pattern, txt)[0].replace("\n", " "),
                'duration': re.findall(duration_pattern, txt)[0],
                'date': re.findall(date_pattern, txt)[0],
                'category': "News & Politics",
            }
        except IndexError:
            # A pattern did not match: skip the episode instead of aborting the
            # whole run with partially filled columns.
            failcounter += 1
            continue

        idx = 1
        max_minute = max(sub.start.minutes for sub in subs)

        for minute in range(max_minute):
            text = ""
            # The bound check prevents an IndexError on very short subtitle files.
            while idx < len(subs) and subs[idx].start.minutes == minute:
                text += re.sub(clean_pattern, "", subs[idx].text).replace("\n"," ") + " "
                idx += 1
            for key, value in metadata.items():
                mediathek_dict[key].append(value)
            mediathek_dict['minute'].append(minute)
            mediathek_dict['transcript'].append(text)

df = pd.DataFrame(mediathek_dict)
print(f"successfully created dataframe with {len(df.index)} minutes of transcript data.")
if failcounter:
    print(f"skipped {failcounter} file(s) that could not be read or parsed.")

100%|██████████| 9/9 [00:07<00:00,  1.17it/s]

successfully created dataframe with 24284 minutes of transcript data.





In [24]:
# Lemmatize the transcripts, then turn the HH:MM:SS duration strings into
# seconds. Not re-runnable on the same frame: the second statement expects
# 'duration' to still hold strings.
df = preprocess(df)
df['duration'] = df['duration'].map(convert_string_to_seconds)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
lemmatizing transcript data of 24284 videos...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3036), Label(value='0 / 3036'))), …

In [25]:
# Collapse the different spellings of a show name into one identifier.
df['id'] = df['id'].replace({
    'MONITOR studioM': 'Monitor',
    'Menschen bei Maischberger': 'Maischberger',
    'maischberger. die woche': 'Maischberger',
})

In [26]:
# Persist the preprocessed frame, then keep working with it under the name
# df_mediathek. This is an alias of df, not a copy: later changes to either
# name affect the same object.
df.to_pickle('../data/mediathek_data.pkl')
df_mediathek = df

In [27]:
# Load a second frame from ../data/topic.pkl and stack it below the mediathek
# rows; ignore_index=True renumbers the combined rows 0..N-1, so the mediathek
# rows come first.
# NOTE(review): this notebook does not show how topic.pkl was produced —
# presumably the YouTube data; only unpickle files from a trusted source.
df_youtube = pd.read_pickle('../data/topic.pkl')
df_combined = pd.concat([df_mediathek, df_youtube], ignore_index=True)

In [28]:
# Copy the 'topic' column from a previously saved combined frame onto the
# freshly built one, then save the result.
# NOTE(review): overwriting the index requires both frames to have the same
# number of rows, and the topic assignment is only meaningful if the rows are
# also in the same order as in topics_combined_old.pkl — TODO confirm.
test_df = pd.read_pickle('../data/topics_combined_old.pkl')
df_combined.index = test_df.index
df_combined['topic'] = test_df['topic']
df_combined.to_pickle('../data/topics_combined.pkl')

In [29]:
# The mediathek rows are the first len(df_mediathek) rows of df_combined
# (they were concatenated first), so take exactly that many topics.
# The previous hard-coded slice 0:24283 was one short of the frame's row
# count, leaving the last row without a topic. Assign positionally so the
# result does not depend on df_combined's index labels.
df_mediathek['topic'] = df_combined['topic'].iloc[:len(df_mediathek)].to_numpy()
df_mediathek.to_pickle('../data/mediathek_data.pkl')

In [30]:
# Sanity check: list the distinct show identifiers after normalization.
df_mediathek['id'].unique()

array(['Monitor', 'frontal', 'Maischberger', 'Tagesthemen',
       'maybrit illner', 'Markus Lanz', 'Frontal 21', 'Hart aber fair',
       'Anne Will'], dtype=object)

In [31]:
# Distinct show identifiers broadcast by ZDF. The mask is built from the same
# frame that is indexed, instead of relying on `df` still being an alias of
# `df_mediathek`.
df_mediathek.loc[df_mediathek['medium'] == 'ZDF', 'id'].unique()

array(['frontal', 'maybrit illner', 'Markus Lanz', 'Frontal 21'],
      dtype=object)