In [1]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
import pandas as pd

import joblib
import numpy as np

topic_dict = {
    0: 'Tech',
    1: 'Internationale Wahlen',
    2: 'Kunst & Literatur',
    3: 'Wetter & Fußball',
    4: 'Wirtschaft',
    5: 'Justiz',
    6: 'International',
    7: 'Ukrainekonflikt',
    8: 'Familie',
    9: 'Impfung',
    10: 'Interview',
    11: 'Innenpolitik',
    12: 'Wahlen in Deutschland',
    13: 'Parteienpolitik',
    14: 'Coronamaßnahmen',
}

media = [
    'junge Welt',
    "NachDenkSeiten",
    'taz',
    'Süddeutsche Zeitung',
    'stern TV',
    "DER SPIEGEL",
    'Der Tagesspiegel',
    'ARD',
    'Tagesschau',
    'ZDF',
    "ZDFheute Nachrichten",
    'Bayerischer Rundfunk',
    'ntv Nachrichten',
    'RTL',
    'FOCUS Online',
    'ZEIT ONLINE',
    'faz',
    'WELT',
    "BILD",
    'NZZ Neue Zürcher Zeitung',
    "Junge Freiheit",
    'COMPACTTV'
]

def define_print(verbose=True):
    if verbose:
        verboseprint = print
    else:
        verboseprint = lambda *args: None
    return verboseprint


def extract_topics(df, cv_model, lda_model, to_csv=True, verbose=True):
    verboseprint = define_print(verbose=verbose)
    df.dropna(inplace=True)

    verboseprint(f"getting topics for {df.shape[0]} videos...")
    lda = cv_model.transform(df["preprocessed"].to_list())
    lda = lda_model.transform(lda)
    lda = pd.DataFrame(lda)
    lda.rename(columns=topic_dict, inplace=True)
    dominant_topic_list = [topic_dict[topic] for topic in np.argmax(lda.values, axis=1)]
    dominant_topic_mask = np.max(lda.values, axis=1) < 0.3
    lda["dominant topic"] = dominant_topic_list
    lda.loc[dominant_topic_mask, "dominant topic"] = "None"

    # lda['dominant topic'] = lda.apply(lambda row: np.argmax(row.values) if np.max(row.values) > 0.3 else 'None')
    lda["id"] = df["id"].to_list()

    verboseprint("merging data...")
    df = df.merge(lda, how="outer", on="id")
    if to_csv:
        verboseprint("saving csv file...")
        df.to_csv("../data/labeled/" + df["medium"].iloc[0] + "_labeled.csv")
    return df


def sort_topics(dfs, to_csv=True, verbose=True):
    verboseprint = define_print(verbose=verbose)
    dfs_dict = {}

    verboseprint("initializing dataframes...")
    for _, topic in topic_dict.items():
        dfs_dict[topic] = pd.DataFrame()

    dfs_dict["None"] = pd.DataFrame()

    verboseprint(f"iterating through {len(dfs)} input dataframes...")
    for df in dfs:
        verboseprint("sorting " + df["medium"].iloc[0] + " dataframe by topic...")
        for _, topic in topic_dict.items():
            dfs_dict[topic] = pd.concat(
                [dfs_dict[topic], df[df["dominant topic"] == topic]]
            )
        dfs_dict["None"] = pd.concat(
            [dfs_dict["None"], df[df["dominant topic"] == "None"]]
        )

    if to_csv:
        verboseprint("saving csv files...")
        for _, topic in topic_dict.items():
            dfs_dict[topic].to_csv("../data/sorted/" + topic + ".csv")
        dfs_dict["None"].to_csv("../data/sorted/None.csv")

    return dfs_dict


def get_N_matrix(topic, verbose=True, drop_subsumed=True, drop_medium_specific=True):
    verboseprint = define_print(verbose=verbose)
    MEDIA = media
    cv = CountVectorizer(max_df=0.9, min_df=10, max_features=10000, ngram_range=(1, 3))

    verboseprint("importing dataframe with topic " + topic + " and fitting model...")
    df = pd.read_csv("../data/sorted/" + topic + ".csv", index_col=0)
    cv.fit(df["preprocessed"])

    verboseprint("restructuring dataframe with " + str(len(df)) + " transcripts...")
    df["preprocessed"] = df["preprocessed"] + " "
    df = df[["medium", "preprocessed", "dominant topic"]]
    df_grouped = df.groupby(["medium", "dominant topic"]).sum()

    df = pd.DataFrame(index=MEDIA, columns=["preprocessed"])
    empty_media = []
    for medium in MEDIA:
        try:
            df.loc[medium] = df_grouped.loc[medium].loc[topic]["preprocessed"]
        except:
            print(
                medium
                + " does not have any videos categorized under category '"
                + topic
                + "'."
            )
            df.drop(index=medium, inplace=True)
            empty_media.append(medium)

    for empty_medium in empty_media:
        MEDIA.remove(empty_medium)

    verboseprint("counting n-gram occurences...")
    N_matrix = cv.transform(df["preprocessed"].values)
    N_df = pd.DataFrame(
        data=N_matrix.toarray().transpose(),
        columns=df.index,
        index=cv.get_feature_names_out(),
    )

    if drop_medium_specific:
        verboseprint(
            "dropping medium-specific n-grams that occur in one medium at least 90% of the time..."
        )
        N_sum = N_df.sum(axis=1)
        mask = {}
        specific_mask = np.full(len(N_df.index), False)
        mask_df = N_df.apply(lambda x: x>0.9*N_sum)
        for medium in MEDIA:
            specific_mask = specific_mask | mask_df[medium].values
        N_df.drop(N_df.index[specific_mask], inplace=True)

    if drop_subsumed:
        N_df = N_df.reset_index().rename(columns={"index": "phrase"})
        N_df["n_gram"] = N_df["phrase"].apply(str.split).apply(len)
        N_df["count"] = N_df[MEDIA].sum(axis=1)

        monograms = N_df[N_df["n_gram"] == 1]
        bigrams = N_df[N_df["n_gram"] == 2]
        trigrams = N_df[N_df["n_gram"] == 3]
        bigram_words = list(
            set(
                [
                    word
                    for bigram_sublist in bigrams["phrase"].apply(str.split).tolist()
                    for word in bigram_sublist
                ]
            )
        )
        trigram_words = list(
            set(
                [
                    word
                    for trigram_sublist in trigrams["phrase"].apply(str.split).tolist()
                    for word in trigram_sublist
                ]
            )
        )

        verboseprint("extracting subsumed n-grams...")
        monograms_in_bigrams = monograms[monograms["phrase"].isin(bigram_words)]
        monograms_in_trigrams = monograms[monograms["phrase"].isin(trigram_words)]

        bigrams_in_trigrams_words = list(
            set(
                [
                    bigram_word
                    for bigram_word in bigram_words
                    if bigram_word in trigram_words
                ]
            )
        )
        bigrams_in_trigrams_mask = bigrams["phrase"].apply(
            lambda bigram: True
            if bigram.split()[0] in bigrams_in_trigrams_words
            or bigram.split()[1] in bigrams_in_trigrams_words
            else False
        )
        bigrams_in_trigrams = bigrams[bigrams_in_trigrams_mask]

        threshold = 0.7
        verboseprint(
            f"filtering n-grams which are subsumed more than {int(100*threshold)}% of the time..."
        )
        monograms_in_bigrams_above_threshold = list(
            set(
                [
                    monogram["phrase"]
                    for _, monogram in monograms_in_bigrams.iterrows()
                    for _, bigram in bigrams.iterrows()
                    if monogram["phrase"] in bigram["phrase"].split()
                    and bigram["count"] > threshold * monogram["count"]
                ]
            )
        )
        monograms_in_trigrams_above_threshold = list(
            set(
                [
                    monogram["phrase"]
                    for _, monogram in monograms_in_trigrams.iterrows()
                    for _, trigram in trigrams.iterrows()
                    if monogram["phrase"] in trigram["phrase"].split()
                    and trigram["count"] > threshold * monogram["count"]
                ]
            )
        )
        bigrams_in_trigrams_above_threshold = list(
            set(
                [
                    bigram["phrase"]
                    for _, bigram in bigrams_in_trigrams.iterrows()
                    for _, trigram in trigrams.iterrows()
                    if (
                        bigram["phrase"] in " ".join(trigram["phrase"].split()[:2])
                        or bigram["phrase"] in " ".join(trigram["phrase"].split()[-2:])
                    )
                    and trigram["count"] > threshold * bigram["count"]
                ]
            )
        )
        n_grams_above_threshold = list(
            set(
                np.append(
                    np.append(
                        monograms_in_bigrams_above_threshold,
                        monograms_in_trigrams_above_threshold,
                    ),
                    bigrams_in_trigrams_above_threshold,
                )
            )
        )

        N_df.drop(
            N_df[N_df["phrase"].isin(n_grams_above_threshold)].index, inplace=True
        )
        N_df.set_index("phrase", inplace=True)
        N_df.drop(columns=["n_gram", "count"], inplace=True)
    return N_df


def filter_N_by_information_score(N_df, n=1000, verbose=True):
    verboseprint = define_print(verbose=verbose)
    verboseprint("filtering " + str(n) + " most discriminative phrases from sample...")
    n_i = len(N_df.index)
    n_j = len(N_df.columns)
    P_ij = N_df / N_df.to_numpy().sum()
    P_i = P_ij.sum(axis=1)
    P_j = P_ij.sum(axis=0)

    I = np.zeros((n_i, n_j))

    for i in range(n_i):
        for j in range(n_j):
            I[i][j] = P_ij.values[i][j] * np.log2(P_ij.values[i][j] / P_i[i] / P_j[j])

    I = pd.DataFrame(I, index=N_df.index, columns=N_df.columns)
    I = I.fillna(0.0)
    I["sum"] = I.sum(axis=1)
    I.sort_values(by="sum", ascending=False, inplace=True)
    return N_df.loc[I.index[:n]]


In [2]:
topic_info = pd.read_pickle('topic_info.pkl')
topic = topic_info.iloc[1]['Name']
print(topic)

N_df = pd.read_pickle(f'../data/N_matrices_pre_purge/N_{topic}.pkl')
#N_df = get_N_matrix(topic=topic)
N_df = filter_N_by_information_score(N_df)

#N = pd.read_csv('../data/N_matrices_pre_purge/N_'+topic+'_pre_purge.csv', index_col=0)

0_ukraine_russland_russischen_putin
filtering 1000 most discriminative phrases from sample...


  I[i][j] = P_ij.values[i][j] * np.log2(P_ij.values[i][j] / P_i[i] / P_j[j])
  I[i][j] = P_ij.values[i][j] * np.log2(P_ij.values[i][j] / P_i[i] / P_j[j])


In [3]:
blacklist = [
    'the',
    'to',
    'that',
    'of',
    'uh',
    'we',
    'is',
    'you',
    'this',
    'peter',
    'are',
    'it',
    'have',
    'like',
    'with',
    'on',
    'our',
    'but',
    'what',
    'they',
    'be',
    'sozusagen',
    'from',
    'people',
    'not',
    'philipp',
    'as',
    'there',
    'because',
    'of the',
    'can',
    'has',
    'about',
    'or',
    'julian',
    'now',
    'very',
    'do',
    'who',
    'think',
    'at',
    'if',
    'by',
    'their',
    'my',
    'just',
    'yeah',
    'would',
    'this is',
    'more',
    'we have',
    'he',
    'when',
    'we are',
    'to the',
    'vitali',
    'were',
    'katrin',
    'how',
    'going',
    'thank you',
    'support',
    'on the',
    'them',
    'mr',
    'then',
    'that we',
    'one',
    'which',
    'say',
    'other',
    'united',
    'time',
    'these',
    'you know',
    'against',
    'some',
    'many',
    'glaube',
    'today',
    'its',
    'any',
    'right',
    'to be',
    'even',
    'said',
    'something',
    'why',
    'that the',
    'could',
    'your',
    'well',
    'world',
    'go',
    'albrecht',
    'go',
    'from the',
    'really',
    'course',
    'country',
    'going to',
    'here',
    'into',
    'no',
    'with the',
    'want',
    'only',
    'had',
    'those',
    'his',
    'it is',
    'is the',
    'lot',
    'way',
    'clear',
    'should',
    'much',
    'me',
    'theo',
    'continue',
    'need',
    'where',
    'question',
    'button',
    'montag',
    'dienstag',
    'mittwoch',
    'donnerstag',
    'freitag',
    'samstag',
    'sonntag',
    'norbert',
    'okay',
    'over',
    'up',
    'than',
    'again',
    'out',
    'axel',
    'make',
    'abonnieren',
    'to do',
    'get',
    'schreibt',
    'exklusiv',
    'most',
    'denke',
    'see',
    'take',
    'first',
    'januar',
    'februar',
    'märz',
    'april',
    'mai',
    'juni',
    'juli',
    'august',
    'september',
    'oktober',
    'november',
    'dezember',
    'come',
    'tobias',
    'florian',
    'part',
    'gerne',
    'jump',
    'gucken',
    'two',
    'made',
    'barbara',
    'wochen',
    'film',
    'back',
    'nadja',
    'name',
    'gabriel',
    'vorhin',
    'use',
    'sebastian',
    'leuten',
    'vater',
    'andreas',
    'stefan',
    'life',
    'johnny depp',
    'stop',
    'heinz',
    'helmut',
    'mutter',
    'fußball',
    'rainer',
    'community',
    'claudia',
    'instagram',
    'response',
    'jens',
    'day',
    'know',
]

In [4]:
N = N_df[~N_df.index.isin(blacklist)]
N.to_pickle('../data/N_matrices_post_purge/N_'+topic+'_post_purge.pkl')
len(N.columns)

21

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib qt

scaler = StandardScaler()
model = TruncatedSVD(n_components=3)

N_scaled = scaler.fit_transform(N.values)
N_df_trunc = model.fit_transform(N_scaled)

sns.set(palette="coolwarm", style='whitegrid')
sns.scatterplot(
    model.components_[0], 
    model.components_[1], 
    hue=N_df.columns,
    palette='coolwarm',
    ).set(title=f'Thema "{topic}" - Hauptachsen 0 und 1')
plt.figure()
sns.scatterplot(
    model.components_[1], 
    model.components_[2], 
    hue=N_df.columns,
    palette='coolwarm',
    ).set(title=f'Thema "{topic}" - Hauptachsen 1 und 2')
plt.show()



In [6]:
political_spectrum_dict = {
    'junge Welt':'left',
    'NachDenkSeiten':'left',
    'taz':'left leaning',
    'Süddeutsche Zeitung':'center',
    'stern TV':'left leaning',
    'DER SPIEGEL':'left leaning',
    'Der Tagesspiegel':'center',
    'ARD':'center',
    'ZDF':'center',
    'ZDFheute Nachrichten':'center',
    'Bayerischer Rundfunk':'center',
    'ntv Nachrichten':'center',
    'RTL':'right leaning',
    'FOCUS Online':'right leaning',
    'ZEIT ONLINE':'center',
    'faz':'center',
    'WELT':'center',
    'BILD':'right leaning',
    'NZZ Neue Zürcher Zeitung':'right leaning',
    'Junge Freiheit':'right',
    'COMPACTTV':'right'
}

In [14]:
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib qt

n_components=3
scale = False

scaler = StandardScaler()
model = TruncatedSVD(n_components=n_components)

if scale:
    N_scaled = scaler.fit_transform(N.values)
    N_df_trunc = model.fit_transform(N_scaled)
else:
    N_df_trunc = model.fit_transform(N.values)

for n in range(n_components):
    fig = px.scatter(x=model.components_[n], y=[0.0 for test in model.components_[n]], color=political_spectrum_dict.values(), hover_data=[N_df.columns], title=f'{n}. Principal Component')
    fig.show()