In [None]:
import json
import os
import warnings

# Ignore all warnings from here on, including any raised while importing the packages below.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from bertopic import BERTopic
from wordcloud import WordCloud

from helpers import convert_filename

In [None]:
# Directory that receives every artifact written by this notebook (SVG, XLSX, PDF, HTML).
RESULTS_DIR = "findings"
# Create it up front so the write calls in later cells do not fail when the
# directory is missing (e.g. on a fresh checkout); a no-op if it already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Load the stop-word list (one entry per line) into a set for fast membership tests.
# The encoding is pinned so decoding does not depend on the platform default.
# NOTE(review): assumes stopwords.txt is stored as UTF-8 — confirm.
with open("stopwords.txt", "r", encoding="utf-8") as stop_file:
    stop_words = set(stop_file.read().splitlines())

Start Lemmatized Tweets Processing

In [None]:
# Build the inputs used by the topic-model cells from the lemmatized tweets:
#   tweets     - one stop-word-filtered document per surviving tweet
#   classes    - the candidate each entry of `tweets` belongs to (parallel to `tweets`)
#   candidates - every candidate key found in the JSON file, in file order
tweets = []
classes = []
candidates = []

# JSON text is UTF-8 by specification; pin it rather than rely on the platform default.
with open("data/lemm_tweets.json", "r", encoding="utf-8") as in_file:
    # Mapping of candidate name -> list of lemmatized tweet strings.
    lemm_tweets = json.load(in_file)

for candidate, candidate_tweets in lemm_tweets.items():
    candidates.append(candidate)
    for tweet in candidate_tweets:
        non_stop_words = [word for word in tweet.split() if word not in stop_words]

        # Tweets consisting solely of stop words are skipped, so `tweets` and
        # `classes` stay aligned and no empty document is produced.
        if non_stop_words:
            tweets.append(" ".join(non_stop_words))
            classes.append(candidate)

In [None]:
title_font = {"family": "Times", "size": 16}

# Draw and export one word cloud per candidate from the unfiltered lemmatized
# tweets; stop words are handed to WordCloud through its `stopwords` argument.
for cand, cand_tweets in lemm_tweets.items():
    wc = WordCloud(max_words=100, random_state=42, width=800, height=600,
                   stopwords=stop_words, background_color="white", colormap="tab10", min_font_size=6)
    wc.generate(" ".join(cand_tweets))

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title(cand, fontdict=title_font, pad=20)
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")

    file_name = convert_filename(cand).lower()
    wc_svg = wc.to_svg(embed_font=True)
    # Write the SVG text as UTF-8 explicitly: with the platform default encoding,
    # non-ASCII words could fail to encode or end up mis-encoded in the file.
    with open(f"{RESULTS_DIR}/wc_{file_name}.svg", "w", encoding="utf-8") as out_file:
        out_file.write(wc_svg)

    plt.show()
    # Release the figure so figures do not pile up across loop iterations.
    plt.close(fig)

In [None]:
# Fit a multilingual BERTopic model on the stop-word-filtered tweets.
# `topics` and `probs` are the two values returned by fit_transform.
# NOTE(review): no random seed is set here, so topic assignments may differ
# between runs — confirm whether reproducibility matters for these results.
topic_model = BERTopic(
    verbose=True,
    language="multilingual",
)
topics, probs = topic_model.fit_transform(tweets)

In [None]:
# Reduce the fitted model to 24 topics.
topic_model.reduce_topics(tweets, nr_topics=24)

# Generate labels from 5 words joined by ", " with the topic prefix turned off,
# then register them on the model as its topic labels.
topic_labels_lemm = topic_model.generate_topic_labels(
    separator=", ",
    topic_prefix=False,
    nr_words=5,
)
topic_model.set_topic_labels(topic_labels_lemm)

# Export the topic overview table and preview it in the notebook.
# NOTE(review): head(25) presumably covers the 24 topics plus the -1 outlier row — confirm.
df = topic_model.get_topic_info()
df.to_excel(f"{RESULTS_DIR}/topics.xlsx", index=False)
df.head(25)

In [None]:
# Bar charts of word scores (top_n_topics=12, n_words=8), saved as PDF and shown inline.
# NOTE(review): write_image presumably needs a static-image export backend
# (e.g. kaleido) installed — confirm the environment provides one.
fig = topic_model.visualize_barchart(
    width=300,
    n_words=8,
    top_n_topics=12,
)
fig.write_image(f"{RESULTS_DIR}/topic_word_scores.pdf")
fig.show()

In [None]:
# Compute topic statistics per candidate, using the candidate of each tweet as its class.
topics_per_class = topic_model.topics_per_class(tweets, classes=classes)

# Plot the per-class topic distribution with the custom labels, save it as HTML, and show it.
fig = topic_model.visualize_topics_per_class(
    topics_per_class,
    custom_labels=True,
    width=1000,
    top_n_topics=12,
)
fig.write_html(f"{RESULTS_DIR}/topics_per_class.html")
fig.show()

In [None]:
# For every candidate, export and print the 12 most frequent topics with id > -1
# (BERTopic uses -1 for the outlier topic).
for candidate in candidates:
    df = (
        topics_per_class
        # `@candidate` binds the Python variable inside the query, so a name that
        # contains quotes or other special characters cannot break the expression.
        .query("Class == @candidate and Topic > -1")
        .sort_values(by=["Frequency"], ascending=False)
        # NOTE(review): the "Name" column is not created in this cell; presumably it
        # is added to `topics_per_class` by visualize_topics_per_class in the
        # previous cell — confirm, since this slice raises KeyError without it.
        .loc[:, "Words":"Name"]
        .drop("Class", axis=1)
        .head(12)
    )

    file_name = convert_filename(candidate).lower()
    df.to_excel(f"{RESULTS_DIR}/{file_name}.xlsx", index=False)

    print(candidate)
    print(df.to_string(index=False))
    print("\n\n")

In [None]:
# Disabled cell: the visualize_documents call below sits inside a bare
# triple-quoted string, so it is never executed; the cell only evaluates to
# that string. Remove the surrounding quotes to run the visualization.
"""
topic_model.visualize_documents(
    tweets, height=600, custom_labels=True, topics=list(range(12)))
"""