# In[12]:
# imports
from pathlib import Path
from typing import Optional

import pandas as pd
import plotly.express as px


# Load the pickled topics table; the code below reads its "quarter" and
# "topic" columns.
topics = pd.read_pickle("data/topics.pkl")

# One [quarter, text] pair per distinct quarter (in order of first appearance),
# where text is every topic of that quarter joined by single spaces. Searching
# one big string per quarter is cheaper than scanning row by row.
quarter_topic_god_strings = [
    [
        quarter_label,
        " ".join(topics.loc[topics["quarter"] == quarter_label, "topic"].to_list()),
    ]
    for quarter_label in topics["quarter"].unique()
]


def count_word_occurrences(word: str) -> pd.DataFrame:
    """Count case-insensitive occurrences of ``word`` in every quarter.

    Matching is a plain substring search on the joined topic text of each
    quarter (``str.count``), so ``"wahl"`` is also counted inside
    ``"Bundestagswahl"``. Overlapping matches are not counted twice.

    Args:
        word: Text to look for. An empty string yields 0 for every quarter.

    Returns:
        A DataFrame with the columns ``"quarter"`` and ``"occurrences"``,
        one row per entry of ``quarter_topic_god_strings`` in its order.
    """
    needle = word.lower()
    rows = [
        # str.count("") would return len(text) + 1, which is not a mention
        # count, so an empty search term is reported as zero occurrences.
        [quarter, god_string.lower().count(needle) if needle else 0]
        for quarter, god_string in quarter_topic_god_strings
    ]
    return pd.DataFrame(rows, columns=["quarter", "occurrences"])

def plot_word_occurrences(word: str, year: Optional[int] = None, save: bool = False) -> None:
    """Plot how often ``word`` is mentioned per quarter as a line chart.

    The counts come from ``count_word_occurrences`` (case-insensitive
    substring matching). The figure is always shown; nothing is returned.

    Args:
        word: Text to search for in the topics.
        year: If given, every topic of that year (rows of ``topics`` whose
            ``"year"`` column equals ``year``) containing ``word`` is printed,
            stripped of surrounding whitespace, before the chart is drawn.
        save: If true, the figure is also written as HTML to
            ``plots/as_html/<word>_occurence.html``; the directory is created
            when it does not exist yet.
    """
    counts = count_word_occurrences(word)

    if year is not None:
        year_topics = topics[topics["year"] == year]
        # regex=False: match the word literally, exactly like the str.count
        # based counting does, so terms such as "c++" or "u.s." neither raise
        # nor over-match. na=False: a missing topic counts as "no match"
        # instead of producing a NaN mask that cannot be used for indexing.
        mentions_word = year_topics["topic"].str.contains(
            word, case=False, regex=False, na=False
        )
        for topic in year_topics.loc[mentions_word, "topic"].str.strip():
            print(topic)

    fig = px.line(
        counts,
        title=f"Mentions of word \"{word.capitalize()}\" per quarter",
        x="quarter",
        y="occurrences",
        labels={"occurrences": "Number of Mentions", "quarter": "Year"},
        width=1000,
        height=500,
    )

    # Label only the first quarter of each year (values ending in "1") with the
    # part before the "/", and leave the other ticks blank. The labels are
    # taken from the plotted frame so they always line up with the x values.
    tick_labels = [
        quarter.split("/")[0] if quarter.endswith("1") else " "
        for quarter in counts["quarter"]
    ]

    fig.update_layout(
        xaxis=dict(
            tickmode="array",
            tickvals=list(range(len(tick_labels))),
            ticktext=tick_labels,
        )
    )
    fig.show()

    if save:
        target = f"plots/as_html/{word}_occurence.html"
        # Without the directory write_html fails with FileNotFoundError.
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        fig.write_html(target)  # save as html



# In[13]:
# Chart the quarterly mentions of "bundestagswahl"; save=True additionally
# writes the figure to plots/as_html/bundestagswahl_occurence.html.
plot_word_occurrences("bundestagswahl", save=True)