In [3]:
# If running in Colab, uncomment the next line
# !pip -q install bertopic[visualization] sentence-transformers datasets umap-learn hdbscan plotly pyarrow


In [4]:
import itertools, re, random, math
from datetime import datetime
from collections import Counter
import numpy as np
import pandas as pd

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap, hdbscan
import plotly.io as pio

# Reproducibility
random.seed(42); np.random.seed(42)

# Plotly in notebooks
pio.renderers.default = "colab"  # or "notebook" / "vscode" depending on your env

# ---- User-tunable knobs ----
N_DOCS        = 50_000      # start 20k–50k; scale if you have the compute
DATE_FROM     = "2015-01-01"
DATE_TO       = "2025-01-01"
TARGET_SUBS   = None        # e.g., {"technology","politics","AskReddit"} or None for all
MIN_CHARS     = 40          # drop ultra-short posts/comments
LANGUAGE_GUARD = True       # quick English-ish heuristic
EMBEDDER_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # fast; switch to all-mpnet-base-v2 for higher quality
FORCE_NR_TOPICS = None      # e.g., 200 to force topic reduction; None lets HDBSCAN decide
MIN_CLUSTER_SIZE = 60       # raise to merge tiny clusters; lower to allow more topics
N_TOPICS_FOR_PLOTS = 20     # how many top topics to show in barchart/time plots
TIME_AGG = "month"          # "month" or "day" (month recommended)
SAVE_DIR = "outputs_bertopic_reddit"


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.makedirs(SAVE_DIR, exist_ok=True)

def clean_text(txt: str) -> str:
    # Basic cleaning: strip URLs, extra whitespace; keep punctuation for semantics
    txt = re.sub(r"http[s]?://\S+|www\.\S+", " ", txt)
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def is_englishish(txt: str) -> bool:
    # Very rough: exclude chunks with lots of CJK/Cyrillic/etc. Keep ASCII/Latin mostly.
    # Remove this if you want multilingual topics.
    return not re.search(r"[一-龥ぁ-ゟ゠-ヿԱ-ՖА-Яа-яא-תء-ي]", txt)

def to_period(ts: str):
    # Dataset uses ISO-like strings; we format to yyyy-mm when TIME_AGG="month"
    if not ts:
        return None
    try:
        if TIME_AGG == "month":
            return ts[:7]  # yyyy-mm
        elif TIME_AGG == "day":
            return ts[:10] # yyyy-mm-dd
        else:
            return ts
    except Exception:
        return None


In [6]:
# Streaming avoids downloading the full dataset
ds_iter = load_dataset("gk4u/reddit_dataset_28", split="train", streaming=True)

def row_ok(r) -> bool:
    dt = r.get("datetime", "")
    txt = (r.get("text") or "").strip()
    if not txt or len(txt) < MIN_CHARS:
        return False
    if not (DATE_FROM <= dt <= DATE_TO):
        return False
    if TARGET_SUBS and r.get("communityName") not in TARGET_SUBS:
        return False
    if LANGUAGE_GUARD and not is_englishish(txt):
        return False
    return True

subset_iter = (r for r in ds_iter if row_ok(r))
sample = []
for r in subset_iter:
    # basic in-loop cleaning
    r["text"] = clean_text(r["text"])
    if r["text"]:
        sample.append(r)
    if len(sample) >= N_DOCS:
        break

len(sample), sample[0].keys() if sample else None


(50000,
 dict_keys(['text', 'label', 'dataType', 'communityName', 'datetime', 'username_encoded', 'url_encoded']))

In [7]:
docs        = [r["text"] for r in sample]
timestamps  = [r.get("datetime","") for r in sample]
periods     = [to_period(ts) for ts in timestamps]
subs        = [r.get("communityName","") for r in sample]

# Deduplicate exact duplicates to reduce noise (optional)
dedup_map = {}
final_docs, final_periods, final_subs = [], [], []
for d, p, s in zip(docs, periods, subs):
    if d not in dedup_map:
        dedup_map[d] = 1
        final_docs.append(d)
        final_periods.append(p)
        final_subs.append(s)

print(f"Kept {len(final_docs)} unique docs out of {len(docs)}")


Kept 48345 unique docs out of 50000


In [8]:
embedder = SentenceTransformer(EMBEDDER_NAME)
embeddings = embedder.encode(
    final_docs,
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
embeddings.shape


Batches: 100%|██████████| 378/378 [11:12<00:00,  1.78s/it]


(48345, 384)

In [9]:
umap_model = umap.UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0,
    metric="cosine", random_state=42
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics=FORCE_NR_TOPICS,            # None lets HDBSCAN decide
    language="english",
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(final_docs, embeddings)
topic_info = topic_model.get_topic_info()
topic_info.head(10)


2025-10-30 23:01:25,697 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-30 23:02:10,376 - BERTopic - Dimensionality - Completed ✓
2025-10-30 23:02:10,378 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-30 23:02:34,638 - BERTopic - Cluster - Completed ✓
2025-10-30 23:02:34,731 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-30 23:02:40,474 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19111,-1_the_and_to_you,"[the, and, to, you, of, it, that, in, is, for]","[Warning: Long story. &#x200B; July of 2002, w..."
1,0,2759,0_trump_the_of_president,"[trump, the, of, president, in, by, to, that, ...","[Welcome to **Lost in the Sauce**, keeping you..."
2,1,1128,1_was_me_he_my,"[was, me, he, my, she, and, had, her, the, to]",[I am not usually one to take revenge. I have ...
3,2,1074,2_area_any_in_looking,"[area, any, in, looking, anyone, im, there, ar...",[Hi I(M25) am planning to move from South Afri...
4,3,1068,3_subreddit_post_posts_sub,"[subreddit, post, posts, sub, you, this, rules...",[This post contains a breakdown of the rules a...
5,4,793,4_job_you_ux_experience,"[job, you, ux, experience, for, jobs, in, desi...",[This aims to capture what a lot of quant firm...
6,5,729,5_app_ios_beta_iphone,"[app, ios, beta, iphone, ab, alien, 14, apps, ...","[so, i had an iphone 7 plus with ios 14 beta 5..."
7,6,689,6_clan_discord_we_destiny,"[clan, discord, we, destiny, join, active, our...",[We are an Adults only (18+) Mainly U.S. PC De...
8,7,644,7_show_season_episode_episodes,"[show, season, episode, episodes, series, it, ...",[It’s the first season I always think of start...
9,8,629,8_switch_game_vita_windows,"[switch, game, vita, windows, pc, on, your, it...",[Download and more information: [ Recent Chang...


In [10]:
# If you ended up with too many tiny topics, force a target number (e.g., 200)
if FORCE_NR_TOPICS and isinstance(FORCE_NR_TOPICS, int):
    topic_model, _ = topic_model.reduce_topics(final_docs, nr_topics=FORCE_NR_TOPICS)
    topics = topic_model.topics_
    topic_info = topic_model.get_topic_info()

print(f"Total topics (including -1 outliers): {topic_info.shape[0]-1}")
topic_info.head(10)


Total topics (including -1 outliers): 113


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19111,-1_the_and_to_you,"[the, and, to, you, of, it, that, in, is, for]","[Warning: Long story. &#x200B; July of 2002, w..."
1,0,2759,0_trump_the_of_president,"[trump, the, of, president, in, by, to, that, ...","[Welcome to **Lost in the Sauce**, keeping you..."
2,1,1128,1_was_me_he_my,"[was, me, he, my, she, and, had, her, the, to]",[I am not usually one to take revenge. I have ...
3,2,1074,2_area_any_in_looking,"[area, any, in, looking, anyone, im, there, ar...",[Hi I(M25) am planning to move from South Afri...
4,3,1068,3_subreddit_post_posts_sub,"[subreddit, post, posts, sub, you, this, rules...",[This post contains a breakdown of the rules a...
5,4,793,4_job_you_ux_experience,"[job, you, ux, experience, for, jobs, in, desi...",[This aims to capture what a lot of quant firm...
6,5,729,5_app_ios_beta_iphone,"[app, ios, beta, iphone, ab, alien, 14, apps, ...","[so, i had an iphone 7 plus with ios 14 beta 5..."
7,6,689,6_clan_discord_we_destiny,"[clan, discord, we, destiny, join, active, our...",[We are an Adults only (18+) Mainly U.S. PC De...
8,7,644,7_show_season_episode_episodes,"[show, season, episode, episodes, series, it, ...",[It’s the first season I always think of start...
9,8,629,8_switch_game_vita_windows,"[switch, game, vita, windows, pc, on, your, it...",[Download and more information: [ Recent Chang...


In [11]:
# Top terms for a topic (replace 1 with any topic id)
print(topic_model.get_topic(1)[:15])

# Representative docs for topic 1
rep_docs = topic_model.get_representative_docs(1)
rep_docs[:3]


[('was', np.float64(0.011630104182852728)), ('me', np.float64(0.011110411887229403)), ('he', np.float64(0.010525473177760095)), ('my', np.float64(0.009771434025127772)), ('she', np.float64(0.009428824653970812)), ('and', np.float64(0.009250229939773282)), ('had', np.float64(0.009137418191980723)), ('her', np.float64(0.008232554635222253)), ('the', np.float64(0.008090823226692382)), ('to', np.float64(0.007980755128298218))]


 'Some Short Backstory: So this happened back when I was in highschool, in the 11th grade and I was about 16 or 17 at the time. I always worked with the drama productions, and the only adult working there would be the drama teacher himself, Mr. B, cool dude. He was actually my teacher at the time as well, so I was getting extra credit for each night I worked. As for what I did, I basically did everything except work tech and actually perform. So that means, I cleaned before and after shows, sold tickets and concessions, helped with costumes and hair, sometimes help with setting scenes, but my main job was being an usher. I man the door, making sure that only people with tickets (except for a few exceptions like staff) got in and no one sneaked in, as well as helping people find seats if they need help. Sometimes I\'m asked to go grab some extra chairs, though we have limited space for that, so I can only really add about 10 extra seats. &#x200B; Now, on to the actual story. There rarel

In [12]:
# 9.1 Global 2D topic map
fig = topic_model.visualize_topics()
fig.show()

# 9.2 Barchart of top topics
fig = topic_model.visualize_barchart(top_n_topics=N_TOPICS_FOR_PLOTS)
fig.show()

# 9.3 Topic hierarchy / dendrogram
fig = topic_model.visualize_hierarchy()
fig.show()


In [13]:
# Ensure periods are present; drop Nones
docs_time = []
period_time = []
for d, p in zip(final_docs, final_periods):
    if p is not None:
        docs_time.append(d)
        period_time.append(p)

topics_over_time = topic_model.topics_over_time(
    docs=docs_time,
    timestamps=period_time,
    global_tuning=True,
    evolution_tuning=True
)

fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=N_TOPICS_FOR_PLOTS
)
fig.show()


72it [01:11,  1.00it/s]


In [14]:
df_topics = pd.DataFrame({
    "topic": topics,
    "sub": final_subs
})

topic_by_sub = (
    df_topics[df_topics["topic"] != -1]         # exclude outliers
    .value_counts(["sub","topic"])
    .reset_index(name="n")
    .sort_values(["sub","n"], ascending=[True, False])
)

# Top 10 topics per subreddit (if you filtered to a few subs)
top_per_sub = (
    topic_by_sub
    .groupby("sub", group_keys=False)
    .apply(lambda x: x.head(10))
)

top_per_sub.head(30)


Unnamed: 0,sub,topic,n
864,r/1200isfineIGUESSugh,27,2
483,r/13ReasonsWhy,35,5
1455,r/13ReasonsWhy,19,2
3023,r/1500isplenty,3,1
3020,r/24hoursupport,77,1
884,r/2d20games,34,2
887,r/2d20games,18,2
2968,r/2d20games,41,1
3006,r/2d20games,38,1
3012,r/2d20games,30,1


In [15]:
topic_model.save(os.path.join(SAVE_DIR, "bertopic_model"))
topic_info.to_csv(os.path.join(SAVE_DIR, "topic_info.csv"), index=False)

# Save topics-over-time table
pd.DataFrame(topics_over_time).to_csv(os.path.join(SAVE_DIR, "topics_over_time.csv"), index=False)

# Save top-per-subreddit table
top_per_sub.to_csv(os.path.join(SAVE_DIR, "top_topics_per_subreddit.csv"), index=False)

print("Saved to:", SAVE_DIR)




Saved to: outputs_bertopic_reddit
