IMPORTS

In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtubesearchpython import *
import pandas as pd
import numpy as np
import spacy
import glob
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

SCRAPING

In [None]:
def getTranscript(video_id):
    '''
    Downloads Transcript from YouTube video and returns it in a DataFrame.
    :param video_id: String of YouTube video ID
    :return: Pandas DataFrame of Transcript
    '''
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['de', 'en'])
    transcript_df = pd.DataFrame(transcript)
    return transcript_df

### initializing DataFrame of youtube channels to scrape captions data from
channels_dict = {'name':['NachDenkSeiten', 'Spiegel', 'ZDFheute', 'BILD', 'Junge Freiheit'],
                'id':['UCE7b8qctaEGmST38-sfdOsA', 'UC1w6pNGiiLdZgyNpXUnA4Zw', 'UCeqKIgPQfNInOswGRWt48kQ', 'UC4zcMHyrT_xyWlgy5WGpFFQ', 'UCXJBRgiZRZvfilIGQ4wN5CQ']}
channels_df = pd.DataFrame(channels_dict)

### iterating over channels in DataFrame
for index, channel in channels_df.iterrows():
    print('getting info on youtube channel ' + channel['name'] + '...')
    playlist = Playlist(playlist_from_channel_id(channel['id']))

    ### retrieving video ids from channel id
    while playlist.hasMoreVideos:
        try:
            playlist.getNextVideos()
        except:
            pass
    print(f'videos retrieved: {len(playlist.videos)}')

    ### getting transcripts from video id
    transcript_dict = {'id':[], 'transcript':[]}
    for video in range(len(playlist.videos)):
        text = ''
        print('getting transcript of video number ' + str(video) + ' with id ' + playlist.videos[video]['id'])
        try:
            captions = getTranscript(playlist.videos[video]['id'])
            captions = captions[captions['start'] >= 5.0] #start getting captions after 5s
            for line in captions['text'] :
                text += line + ' '
            transcript_dict['id'].append(playlist.videos[video]['id'])
            transcript_dict['transcript'].append(text)
        except:
            print('could not get transcript for video number ' + str(video) + ' with id ' + playlist.videos[video]['id'])

    ### converting to dataframe and saving as csv
    transcript_df = pd.DataFrame(transcript_dict)
    print(transcript_df.head())
    transcript_df.to_csv('data\\raw\\'+ channel['name'] + '.csv')

PREPROCESSING

In [None]:
def preprocess(text):
    '''
    tokenizes and lemmatizes german input text
    :param text: raw input text (german)
    :return: list of lemmatized tokens from input text
    '''
    doc = nlp(str(text))
    lemmas_tmp = [token.lemma_.lower() for token in doc]
    lemmas = [lemma for lemma in lemmas_tmp if lemma.isalpha() and lemma not in filterwords]
    return ' '.join(lemmas)

### initializing spacy with german language
nlp = spacy.load('de_core_news_sm')
filterwords = spacy.lang.de.stop_words.STOP_WORDS
filterwords.update(['musik', 'music', 'applause', 'applaus', 'tv',
                    'bild',
                    'spiegel',
                    'nachdenkseiten',
                    'junge freiheit', 'j ftv' , 'jfv', 'fjt v', 'jftv',
                    'zdf', 'claus kleber'])

### looping through input files
path = 'data/raw/*.csv'
for csv in glob.glob(path):
    ### importing data
    df = pd.read_csv(csv, index_col=0)

    ### preprocess transcript data
    df['preprocessed'] = df['transcript'].apply(preprocess)
    df.to_csv(csv.replace('data\\','data\\preprocessed\\').replace('.csv','_preprocessed.csv'))


### load data
n_samples = [10, 50, 100, 300]
rng_seed = 42
path = 'data\\preprocessed/*.csv'
for i, k in enumerate(n_samples):
    data = pd.DataFrame()
    for csv in glob.glob(path):
        tmp = pd.read_csv(csv, index_col=0)
        tmp = tmp.sample(n=k, random_state=rng_seed)
        tmp['medium'] = csv.replace('data\\preprocessed\\', '').replace('_preprocessed.csv', '')
        data = pd.concat([data, tmp])
    data = shuffle(data, random_state=rng_seed).astype(str)
    data.to_csv('data\\samples\\sample'+str(k)+'.csv')
    del data

VECTORIZING

In [None]:
rng_seed = 42

### importing data
df = pd.read_csv('data/samples/sample10.csv')
data = df.groupby(['medium'])['preprocessed'].sum()
data = data.loc[['NachDenkSeiten', 'Spiegel', 'ZDFheute', 'BILD', 'Junge Freiheit']]
X = data.values
y = data.index


### instantiating vectorizers
cv = CountVectorizer(ngram_range=(1,3))
X_cv = cv.fit_transform(X)

tfidf = TfidfVectorizer(ngram_range=(1,3))
X_tfidf = tfidf.fit_transform(X)

nlp = spacy.load('de_core_news_sm')
X_we = [nlp(x).vector for x in X]

hv = HashingVectorizer(ngram_range=(1,3))
X_hv = hv.fit_transform(X)


### pca for dimension reduction
pca_cv = TruncatedSVD(n_components=2)
features_cv = pca_cv.fit_transform(X_cv)
xs_cv = features_cv[:,0]
ys_cv = features_cv[:,1]

pca_tfidf = TruncatedSVD(n_components=2)
features_cv = pca_tfidf.fit_transform(X_tfidf)
xs_tfidf = features_cv[:,0]
ys_tfidf = features_cv[:,1]

pca_we = TruncatedSVD(n_components=2)
features_we = pca_we.fit_transform(X_we)
xs_we = features_we[:,0]
ys_we = features_we[:,1]

pca_hv = TruncatedSVD(n_components=2)
features_hv = pca_hv.fit_transform((X_hv))
xs_hv = features_hv[:,0]
ys_hv = features_hv[:,1]


### plots
sns.set()
sns.set_style('darkgrid')
fig, axs = plt.subplots(nrows=2, ncols=2)
sns.scatterplot(x=xs_cv, y=ys_cv, hue=y, ax=axs[0, 0], palette='RdBu').set(title='Count Vectorizer')
sns.scatterplot(x=xs_tfidf, y=ys_tfidf, hue=y, ax=axs[0, 1], palette='RdBu').set(title='TFIDF Vectorizer')
sns.scatterplot(x=xs_we, y=ys_we, hue=y, ax=axs[1, 0], palette='RdBu').set(title='Spacy Word Embeddings Vectorizer')
sns.scatterplot(x=xs_hv, y=ys_hv, hue=y, ax=axs[1, 1], palette='RdBu').set(title='Hashing Vectorizer')
plt.show()

Tf-Idf-Analysis

In [4]:
n = 100
def get_top_tf_idf_words(response, top_n=n):
    sorted_nzs = np.argsort(response.data)[:-(top_n+1):-1]
    return feature_names[response.indices[sorted_nzs]]

### importing data
df = pd.read_csv('data/samples/sample300.csv')
data = df.groupby(['medium'])['preprocessed'].sum()
data = data.loc[['NachDenkSeiten', 'Spiegel', 'ZDFheute', 'BILD', 'Junge Freiheit']]
features = data.values
tfidf = TfidfVectorizer(ngram_range=(1,3))
X = tfidf.fit_transform(df['preprocessed'].dropna())
y = data.index
feature_names = np.array(tfidf.get_feature_names_out())

for idx, feature in enumerate(data):
    responses = tfidf.transform([features[idx]])
    print([get_top_tf_idf_words(response, n) for response in responses])

[array(['zitat', 'medien', 'usa', 'ander', 'geben', 'us', 'russland',
       'deutsch', 'politik', 'deutschland', 'prozent', 'menschen',
       'artikel', 'zahlreiche', 'sagen', 'nato', 'leser', 'all',
       'positiv', 'fragen', 'unterstützen', 'sanktionen', 'website',
       'unterstützung', 'entstehen', 'fördern', 'spenden', 'hören',
       'leserinnen', 'youtube', 'bewertung', 'assange', 'politisch',
       'sehen', 'unternehmen', 'entstehen zahlreiche',
       'entsprechend button', 'weiterverbreitung', 'button', 'soundcloud',
       'audio', 'positiv bewertung', 'audio entstehen',
       'klicken einfach entsprechend', 'itunes soundcloud',
       'button website itunes', 'einfach entsprechend button',
       'website itunes', 'website itunes soundcloud', 'abgeben freuen',
       'button website', 'entsprechend button website',
       'bewertung abgeben freuen', 'positiv bewertung abgeben',
       'unterstützen klicken', 'unterstützen klicken einfach',
       'audio entstehen zahl

CLASSIFYING

In [None]:
### choose which media to evaluate
# OPTIONS: NachDenkSeiten, Spiegel, BILD, Junge Freiheit
first = 'Spiegel'
second = 'Junge Freiheit'

### set visualization parameters
sns.set(style='darkgrid')
sns.set_palette('viridis_r')

### load data
seed = 42 #rng seed
n = 300 #number of data points per medium
k = 10 #number of cross validation folds
df_first = pd.read_csv('data/' + first + '_preprocessed.csv', index_col=0)
df_second = pd.read_csv('data/' + second + '_preprocessed.csv', index_col=0)

### sample and shuffle data
first_sample = df_first.sample(n=n, random_state=seed)
first_sample['label'] = first
second_sample = df_second.sample(n=n, random_state=seed)
second_sample['label'] = second
df = pd.concat([first_sample, second_sample])
df = shuffle(df, random_state=seed).astype(str)
print(df.head())

### split data into train and test parts
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed'], df['label'], test_size=0.2, random_state=seed)

### vectorize data and fit and transform model
vectorizer = TfidfVectorizer(ngram_range=(1,3))
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

### instantiate classifier model and fit to training data
clf = MultinomialNB()
clf.fit(X_train_bow, y_train)
y_pred = clf.predict(X_test_bow)

### calculate classification accuracy with k-fold cross validation
scores = cross_val_score(clf, X_test_bow, y_test, cv=k)
print('cross validation scores:\n' + str(scores))
print('mean cv score: ' + str(scores.mean()))
print('confusion matrix:\n' + str(confusion_matrix(y_test, y_pred)))

### data visualization utilizing pca
pca = TruncatedSVD(n_components=2)
features = pca.fit_transform(X_train_bow)
xs = features[:,0]
ys = features[:,1]
sns.scatterplot(x=xs, y=ys, hue=y_train, alpha=0.5, palette='viridis_r')
plt.show()
