# 1. Dataset Check

In [26]:
# Libraries
import pandas as pd
from google.colab import drive
import datetime

In [27]:
# Connect to Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# read dataset
data = pd.read_csv('/content/drive/MyDrive/MasterThesis/Reddit_Data/cleaned_melania.csv')
data.head()

Unnamed: 0,thread_title,thread_time_utc,thread_score,comment_time_utc,comment_score,comment_text,subreddit,full_text
0,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-19 18:26:54,2,your wallet address is public and everyone can...,solana,phantom solana wallet your wallet address is p...
1,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-21 00:26:06,1,yeah hide those you dont need,solana,phantom solana wallet yeah hide those you dont...
2,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-19 17:28:01,1,warning 1 important read this post to keep you...,solana,phantom solana wallet warning 1 important read...
3,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-20 18:52:10,2,scam never touch an airdrop,solana,phantom solana wallet scam never touch an airdrop
4,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-19 18:01:41,1,thank you any idea how my wallet recieved the ...,solana,phantom solana wallet thank you any idea how m...


In [29]:
data.shape

(16212, 8)

In [30]:
# Droping the data aqfter 28.02.2025 to align with the blockchain dat aanalysis
data['comment_time_utc'] = pd.to_datetime(data['comment_time_utc'])
cutoff = pd.to_datetime('2025-02-28')   # no utc=True
data.drop(data[data['comment_time_utc'] > cutoff].index, inplace=True)

In [31]:
data.shape

(15886, 8)

In [32]:
data.nunique()

Unnamed: 0,0
thread_title,120
thread_time_utc,121
thread_score,65
comment_time_utc,15308
comment_score,292
comment_text,14869
subreddit,10
full_text,15255


In [33]:
# dropping duplicates in order to avoid using bots comments or removed comments from analysis
duplicates = data[data.duplicated(subset='comment_text', keep=False)]
duplicates

Unnamed: 0,thread_title,thread_time_utc,thread_score,comment_time_utc,comment_score,comment_text,subreddit,full_text
2,phantom solana wallet,2025-01-19 17:28:00,1,2025-01-19 17:28:01,1,warning 1 important read this post to keep you...,solana,phantom solana wallet warning 1 important read...
7,ceo of bitcoin magazine says he appreciates $5...,2025-01-19 20:53:39,428,2025-01-19 22:16:55,-9,cope,CryptoCurrency,ceo of bitcoin magazine says he appreciates $5...
19,ceo of bitcoin magazine says he appreciates $5...,2025-01-19 20:53:39,428,2025-01-19 21:53:58,2,,CryptoCurrency,ceo of bitcoin magazine says he appreciates $5...
24,ceo of bitcoin magazine says he appreciates $5...,2025-01-19 20:53:39,428,2025-01-19 21:25:32,-7,cope,CryptoCurrency,ceo of bitcoin magazine says he appreciates $5...
26,ceo of bitcoin magazine says he appreciates $5...,2025-01-19 20:53:39,428,2025-01-19 22:22:12,1,exactly,CryptoCurrency,ceo of bitcoin magazine says he appreciates $5...
...,...,...,...,...,...,...,...,...
15857,just delist melania coin already,2025-02-26 17:56:40,194,2025-02-26 19:03:11,1,removed,CryptoMarkets,just delist melania coin already removed
15878,melania trump meme coin tanks in value just on...,2025-02-27 15:01:46,117,2025-02-27 15:36:03,1,,CryptoMarkets,melania trump meme coin tanks in value just on...
15905,melania trump meme coin tanks in value just on...,2025-02-27 15:01:46,117,2025-02-27 18:40:35,1,you dont say,CryptoMarkets,melania trump meme coin tanks in value just on...
15932,melania trump meme coin tanks in value just on...,2025-02-27 15:01:46,117,2025-02-27 16:20:52,1,no shit,CryptoMarkets,melania trump meme coin tanks in value just on...


In [34]:
data = data.drop_duplicates(subset='comment_text', keep='first')


In [35]:
data.shape

(14870, 8)

In [36]:
data = data[data["comment_text"].str.split().str.len() > 4]

In [37]:
data.shape

(12813, 8)

# 2. Topic Modelling

In [38]:
!pip install bertopic[visualization] umap-learn hdbscan




In [39]:

# Import libraries
import pandas as pd
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
import nltk
import re
from nltk.corpus import stopwords
import spacy

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# Text cleaning + lemmatization function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s$]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Lemmatize and remove stopwords
    doc = nlp(text)
    words = [token.lemma_ for token in doc if token.lemma_ not in stop_words and not token.is_punct and not token.is_space]

    cleaned_text = " ".join(words)
    return cleaned_text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Load cleaned dataset from previous part
df = data

# Cleaning using the prepared function
df["clean_comment_text"] = df["comment_text"].dropna().apply(clean_text)
texts = df["clean_comment_text"].dropna().tolist()

# HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=1,
    prediction_data=True
)

# UMAP model
umap_model = umap.UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42
)

# Embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Create BERTopic model
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
    calculate_probabilities=True,
    language="english",
    verbose=True
)

topics, probs = topic_model.fit_transform(texts)

2025-05-06 21:56:34,478 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/401 [00:00<?, ?it/s]

2025-05-06 21:58:55,172 - BERTopic - Embedding - Completed ✓
2025-05-06 21:58:55,173 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 21:59:07,338 - BERTopic - Dimensionality - Completed ✓
2025-05-06 21:59:07,340 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 21:59:23,265 - BERTopic - Cluster - Completed ✓
2025-05-06 21:59:23,276 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 21:59:23,590 - BERTopic - Representation - Completed ✓


In [63]:
# After checking different options, we needed to reduce the number of topics. Based on the topic visualiations numer 8 seemed the most optimal
topic_model.reduce_topics(texts, nr_topics=9)

2025-05-06 22:01:56,970 - BERTopic - Topic reduction - Reducing number of topics
2025-05-06 22:01:56,986 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 22:01:57,135 - BERTopic - Representation - Completed ✓
2025-05-06 22:01:57,139 - BERTopic - Topic reduction - Reduced number of topics from 10 to 9


<bertopic._bertopic.BERTopic at 0x78ec2c004510>

# 3. Visualization and model evaluation

In [64]:
topic_model.visualize_topics()


In [65]:
# Visualize hierarchy of topics
topic_model.visualize_hierarchy()


In [66]:
# Visualize the top topics
topic_model.visualize_barchart(top_n_topics=10)


In [67]:
melania_topic = topic_model.get_topic_info()
melania_topic

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4242,-1_trump_coin_go_people,"[trump, coin, go, people, buy, get, money, lik...",[I agree go rug pull need money also care imag...
1,0,8056,0_crypto_coin_trump_go,"[crypto, coin, trump, go, people, get, like, b...",[think crypto get elect I think lot people lik...
2,1,189,1_rug_pull_rugpull_people,"[rug, pull, rugpull, people, rugge, get, buy, ...","[go get rug pull, trump pull rug, rug pull trump]"
3,2,92,2_grift_grifter_family_grifte,"[grift, grifter, family, grifte, keep, griftin...","[grift grift people eat, go grift end grift, g..."
4,3,70,3_news_tweet_coffeezilla_twitter,"[news, tweet, coffeezilla, twitter, video, sel...","[people dump news mean, lately bad news dump n..."
5,4,64,4_clown_circus_crazy_insane,"[clown, circus, crazy, insane, vote, elect, ye...","[market clown, every circus need clown, clown ..."
6,5,44,5_burger_wendys_dump_doughnut,"[burger, wendys, dump, doughnut, wake, sell, a...","[get buy next burger, always wendys burger kin..."
7,6,35,6_thank_dm_appreciate_pls,"[thank, dm, appreciate, pls, please, comment, ...","[thank much I really appreciate, dm would grea..."
8,7,21,7_smelania_damn_lol_sminem,"[smelania, damn, lol, sminem, smelanias, milia...","[buy smelania really think, I want buy smelani..."


In [68]:
melania_topic.to_csv("/content/drive/MyDrive/MasterThesis/melania_topic.csv", index=False)
