In [2]:
import pandas as pd
from nltk.stem.cistem import Cistem
import re
from nltk.probability import FreqDist
from tabulate import tabulate
from tqdm import tqdm

# spaCy provides the NLP pipeline (POS tagging and lemmatization)
import spacy

# Optional speed-up: uncomment to make spaCy run on the GPU.
# spacy.require_gpu()

# Load the "de_core_news_lg" spaCy pipeline once; it is used further down
# to obtain each token's part-of-speech tag (pos_) and lemma (lemma_).
nlp = spacy.load("de_core_news_lg")

# Load data

In [None]:
# Read the tab-separated article dump; malformed rows are reported as
# warnings instead of aborting the load.
data = pd.read_csv(
    '../data/tagesschau_articles_unique.csv',
    sep='\t',
    engine='python',
    on_bad_lines='warn',
)
data.head()

/home/simon/Desktop/uni/data_literacy/tagesschau-language-change/src


Unnamed: 0,date,headline,short_headline,short_text,article,link
0,2025-10-23,++ Kiew: Übergabe von 1.000 Leichen durch Russ...,Krieg gegen die Ukraine,\n Ukrainische Behö...,Ukrainische Behörden haben die Übergabe von 1....,/newsticker/liveblog-ukraine-donnerstag-512.html
1,2025-10-23,Zehntausende bei Demonstrationen in Ungarn,Wahlkampf-Auftakt,\n Im Frühjahr wähl...,Im Frühjahr wählt Ungarn ein neues Parlament. ...,/ausland/europa/ungarn-proteste-132.html
2,2025-10-23,Litauen meldet Eindringen von russischen Flugz...,Top-Thema,\n Zwei russische M...,Zwei russische Militärflugzeuge sind nach Anga...,/eilmeldung/eilmeldung-8954.html
3,2025-10-23,"""Gemeinsam russische U-Boote jagen""",Deutsch-britische Zusammenarbeit,\n Großbritannien u...,Großbritannien und Deutschland wollen gemeinsa...,/ausland/europa/deutschland-grossbritannien-se...
4,2025-10-23,Wetterlage und Temperaturen,Wettervorhersage Europa,\n Sturmtief JOSHUA...,Sturmtief JOSHUA zieht bis Freitagabend mit se...,/wetter/europa-welt


In [None]:
# Parse the date column into pandas datetime values
data['date'] = pd.to_datetime(data['date'])

# Some entries have no article text (NaN) because they are only a video
# link; remove those rows and renumber the index from 0.
drop_indices = data.loc[data['article'].isna()].index
data = data.drop(index=drop_indices)
data = data.reset_index(drop=True)

# Group the remaining articles by the year of their date
per_year = data.groupby(data['date'].dt.year)

# Number of articles in each year
counts_per_year = per_year.size()
print(counts_per_year)

In [None]:
# Preview the first rows of the group of articles dated 2023.
per_year.get_group(2023).head()

In [None]:
# Single shared NLTK CISTEM stemmer instance, used by stem() below.
cistem = Cistem()

def remove_temporary(msg: str) -> str:
    """Strip everything that is not a letter or a space from ``msg``.

    Every character other than ASCII letters, the German letters
    (ä, ö, ü, Ä, Ö, Ü, ß, ẞ) and the space is replaced by a space; runs of
    whitespace are then collapsed to a single space and the result is
    trimmed at both ends.

    The German letters are kept on purpose: an ASCII-only character class
    would turn e.g. "Größe" into "Gr e". Input that contains only ASCII
    letters is treated exactly as before.

    Args:
        msg: The text to clean.

    Returns:
        The cleaned text; an empty string if no letters remain.
    """
    # Anything outside the allowed alphabet becomes a separator ...
    text = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ ]', ' ', msg)
    # ... and consecutive separators are merged into one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def stem(msg: str) -> str:
    """Stem every space-separated word of ``msg``.

    The text is split on single spaces, each piece is passed to the
    module-level ``cistem`` stemmer, and the stemmed pieces are joined
    back together with single spaces.
    """
    return ' '.join(map(cistem.stem, msg.split(' ')))

def get_words_of_interest(text, types=("VERB", "ADJ", "AUX")):
    """Collect the normalized words of selected part-of-speech classes.

    ``text`` is run through the module-level spaCy pipeline ``nlp``. For
    every token whose ``pos_`` tag is one of ``types``, the token's lemma
    is stemmed with ``stem`` and cleaned with ``remove_temporary``.

    Args:
        text: The raw text to analyse.
        types: Iterable of ``pos_`` tag names to collect. Defaults to
            verbs, adjectives and auxiliaries. (An immutable tuple is used
            as the default so it cannot be modified between calls; passing
            a list still works.)

    Returns:
        A dict mapping each requested tag to the list of normalized words
        found for it, in document order. Tags with no match map to an
        empty list. Tokens whose normalized form is empty (nothing but
        non-letter characters) are skipped so they are not counted as
        words downstream.
    """
    doc = nlp(text)

    # One bucket per requested tag, created up front so every tag is
    # present in the result even without matches.
    result = {pos: [] for pos in types}

    # Single pass over the document instead of one scan per tag.
    for token in doc:
        bucket = result.get(token.pos_)
        if bucket is None:
            continue
        word = remove_temporary(stem(token.lemma_))
        if word:
            bucket.append(word)

    return result

# Quick check: run the extraction on the article at index label 0 and show the result.
get_words_of_interest(data['article'][0])

In [None]:
# Build, for every year, one frequency distribution per part-of-speech class.
# Resulting layout: {year: {'VERB': FreqDist, 'ADJ': FreqDist, 'AUX': FreqDist}}
frequencies_per_year = {}

pos_tags = ('VERB', 'ADJ', 'AUX')

for group, entry in per_year:
    # Fresh, empty counters for this year (same key order as pos_tags)
    year_frequencies = {pos: FreqDist() for pos in pos_tags}

    # Progress bar over this year's article texts
    for article in tqdm(entry['article'], desc=f"Processing year {group}"):
        words = get_words_of_interest(article)

        # Feed this article's words into the running counts
        for pos in pos_tags:
            year_frequencies[pos].update(words[pos])

    frequencies_per_year[group] = year_frequencies

In [None]:
# How many of the most frequent words to list per table
number = 20

# (key in frequencies_per_year[year], header of the word column)
table_specs = (('VERB', 'Verb'), ('ADJ', 'Adj'), ('AUX', 'Aux'))

print("Top frequencies per year:")
for year, freq in frequencies_per_year.items():
    print(f"Year {year}:")

    # One GitHub-style table per part-of-speech class, in a fixed order
    for pos, label in table_specs:
        top_words = pd.DataFrame(freq[pos].most_common(number), columns=[label, 'Frequency'])
        print(tabulate(top_words, headers='keys', tablefmt='github', showindex=False))