# 1. Dataset Check

In [24]:
# Libraries
import pandas as pd
from google.colab import drive
import datetime

In [25]:
# Connect to Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# read dataset
data = pd.read_csv('/content/drive/MyDrive/MasterThesis/Reddit_Data/cleaned_trump.csv')
data.head()

Unnamed: 0,thread_title,thread_time_utc,thread_score,comment_time_utc,comment_score,comment_text,subreddit,full_text
0,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-18 05:13:47,1,i mean i have a group that has 1520 ppl in it ...,therace,crypto meme coin trading 2025 i mean i have a ...
1,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 18:16:41,1,wow what an inspiring journey its super cool t...,therace,crypto meme coin trading 2025 wow what an insp...
2,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 15:22:42,1,ya,therace,crypto meme coin trading 2025 ya
3,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 14:42:22,1,ok fine drop the trading group about to be a c...,therace,crypto meme coin trading 2025 ok fine drop the...
4,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 11:06:54,1,copy real trades on the free afterhour app fro...,therace,crypto meme coin trading 2025 copy real trades...


In [27]:
data.shape

(64181, 8)

In [28]:
# Droping the data aqfter 28.02.2025 to align with the blockchain dat aanalysis
data['comment_time_utc'] = pd.to_datetime(data['comment_time_utc'])
cutoff = pd.to_datetime('2025-02-28')   # no utc=True
data.drop(data[data['comment_time_utc'] > cutoff].index, inplace=True)

In [29]:
data.shape

(56775, 8)

In [30]:
data.nunique()

Unnamed: 0,0
thread_title,354
thread_time_utc,371
thread_score,174
comment_time_utc,54018
comment_score,500
comment_text,51890
subreddit,11
full_text,53782


In [31]:
# dropping duplicates in order to avoid using bots comments or removed comments from analysis
duplicates = data[data.duplicated(subset='comment_text', keep=False)]
duplicates

Unnamed: 0,thread_title,thread_time_utc,thread_score,comment_time_utc,comment_score,comment_text,subreddit,full_text
2,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 15:22:42,1,ya,therace,crypto meme coin trading 2025 ya
4,crypto meme coin trading 2025,2025-01-17 11:06:53,0,2025-01-17 11:06:54,1,copy real trades on the free afterhour app fro...,therace,crypto meme coin trading 2025 copy real trades...
50,trump plans to designate crypto as a national ...,2025-01-17 13:59:58,594,2025-01-17 17:13:29,30,deleted,CryptoCurrency,trump plans to designate crypto as a national ...
60,trump plans to designate crypto as a national ...,2025-01-17 13:59:58,594,2025-01-17 14:36:44,0,greetings elarkspur your comment contained a l...,CryptoCurrency,trump plans to designate crypto as a national ...
69,trump plans to designate crypto as a national ...,2025-01-17 13:59:58,594,2025-01-17 14:36:42,1,removed,CryptoCurrency,trump plans to designate crypto as a national ...
...,...,...,...,...,...,...,...,...
57046,why would trump or any american president be p...,2025-02-27 20:08:46,51,2025-02-27 21:55:01,0,deleted,CryptoMarkets,why would trump or any american president be p...
57064,why would trump or any american president be p...,2025-02-27 20:08:46,51,2025-02-27 21:12:20,2,removed,CryptoMarkets,why would trump or any american president be p...
57133,why would trump or any american president be p...,2025-02-27 20:08:46,51,2025-02-27 20:37:39,0,corruption,CryptoMarkets,why would trump or any american president be p...
57170,why would trump or any american president be p...,2025-02-27 20:08:46,51,2025-02-27 21:03:54,4,removed,CryptoMarkets,why would trump or any american president be p...


In [32]:
data = data.drop_duplicates(subset='comment_text', keep='first')


In [33]:
data.shape

(51891, 8)

In [34]:
data = data[data["comment_text"].str.split().str.len() > 4]

In [35]:
data.shape

(45654, 8)

# 2. Topic Modelling

In [36]:
!pip install bertopic[visualization] umap-learn hdbscan




In [37]:

# Import libraries
import pandas as pd
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
import nltk
import re
from nltk.corpus import stopwords
import spacy

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# Text cleaning + lemmatization function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s$]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Lemmatize and remove stopwords
    doc = nlp(text)
    words = [token.lemma_ for token in doc if token.lemma_ not in stop_words and not token.is_punct and not token.is_space]

    cleaned_text = " ".join(words)
    return cleaned_text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Load cleaned dataset from previous part
df = data

# Cleaning using the prepared function
df["clean_comment_text"] = df["comment_text"].dropna().apply(clean_text)
texts = df["clean_comment_text"].dropna().tolist()

# HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=25,
    min_samples=1,
    prediction_data=True
)

# UMAP model
umap_model = umap.UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42
)

# Embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Create BERTopic model
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
    calculate_probabilities=True,
    language="english",
    verbose=True
)

topics, probs = topic_model.fit_transform(texts)

2025-05-14 14:01:30,860 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1427 [00:00<?, ?it/s]

2025-05-14 14:01:48,746 - BERTopic - Embedding - Completed ✓
2025-05-14 14:01:48,750 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-14 14:02:58,069 - BERTopic - Dimensionality - Completed ✓
2025-05-14 14:02:58,073 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-14 14:08:35,763 - BERTopic - Cluster - Completed ✓
2025-05-14 14:08:35,778 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-14 14:08:36,561 - BERTopic - Representation - Completed ✓


In [39]:
# After checking different options, we needed to reduce the number of topics. Based on the topic visualiations numer 8 seemed the most optimal
topic_model.reduce_topics(texts, nr_topics=10)

2025-05-14 14:08:37,978 - BERTopic - Topic reduction - Reducing number of topics
2025-05-14 14:08:38,048 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-14 14:08:38,525 - BERTopic - Representation - Completed ✓
2025-05-14 14:08:38,531 - BERTopic - Topic reduction - Reduced number of topics from 349 to 10


<bertopic._bertopic.BERTopic at 0x7e9cc1e28810>

# 3. Visualization and model evaluation

In [40]:
topic_model.visualize_topics()


In [41]:
# Visualize hierarchy of topics
topic_model.visualize_hierarchy()


In [42]:
# Visualize the top topics
topic_model.visualize_barchart(top_n_topics=10)


In [43]:
trump_topic = topic_model.get_topic_info()
trump_topic

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18954,-1_coin_trump_crypto_people,"[coin, trump, crypto, people, go, get, make, l...",[read early post read fool tell people go mort...
1,0,19297,0_trump_coin_buy_crypto,"[trump, coin, buy, crypto, money, go, get, mak...",[meme market use anymore melania coin totally ...
2,1,4255,1_go_bag_get_think,"[go, bag, get, think, people, ha, like, know, ...","[go say shit, people always think bag go pump ..."
3,2,2750,2_vote_people_biden_trump,"[vote, people, biden, trump, would, get, like,...","[people vote trump, ok I understand get third ..."
4,3,148,3_whale_kraken_boat_miss,"[whale, kraken, boat, miss, pole, touch, buy, ...","[one kraken I buy, lol whale, kraken I get]"
5,4,133,4_comment_link_moderator_please,"[comment, link, moderator, please, telegram, c...",[greeting zealousidealcheck250 comment contain...
6,5,36,5_toshi_xcn_sokka_haiku,"[toshi, xcn, sokka, haiku, blast, good, divers...","[$ 9 go move toshi, call toshi, toshi time bab..."
7,6,29,6_art_artist_painting_value,"[art, artist, painting, value, copyright, step...","[art steal, rich buy artist paint outright eve..."
8,7,26,7_adoption_mass_mainstream_suuuure,"[adoption, mass, mainstream, suuuure, permissi...","[bruh kill mass adoption, mass adoption finall..."
9,8,26,8_quantum_computer_computing_hack,"[quantum, computer, computing, hack, crypto, e...","[I see single quantum, quantum computer go cra..."


In [44]:
trump_topic.to_csv("/content/drive/MyDrive/MasterThesis/trump_topic.csv", index=False)
