In [None]:
# Install this project in editable mode into the kernel's environment so the
# `reddit_ideology` imports below can be resolved.
%pip install -e .

# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2


# Imports & Configuration
import pandas as pd
import numpy as np
import scipy.stats as stats
from collections import Counter

# project modules
from reddit_ideology.openai_utils import init_openai, generate_topic_label
from reddit_ideology.config import load_config
from reddit_ideology.data_loader import DataLoader
from reddit_ideology.preprocessing import Preprocessor
from reddit_ideology.embedding import EmbeddingModel
from reddit_ideology.topic_model import EmbeddingClusterTopicModel
from reddit_ideology.metrics import MetricsCalculator
from reddit_ideology.visualize import Visualizer

# Load the run configuration used by every cell below.
cfg = load_config("my_config_Q.yaml")

# Echo the configuration for provenance, but never render credentials into the
# saved notebook output: any "api_key" entry found directly inside a section is
# masked (the OpenAI key is read from cfg["openai"]["api_key"] later on).
# NOTE(review): assumes `cfg` is a plain dict of sections — confirm against load_config.
{
    section: (
        {key: ("***" if key == "api_key" else value) for key, value in settings.items()}
        if isinstance(settings, dict)
        else settings
    )
    for section, settings in cfg.items()
}

In [8]:
# Cell 2: Parse Events
# Convert configured events into pandas Timestamps.
# Each kept entry becomes {"name": <str>, "date": <pd.Timestamp>}; entries that
# cannot be parsed are reported and skipped so one bad config row does not abort
# the notebook.
events = []
for ev in cfg.get("events", []):
    try:
        dt = pd.to_datetime(ev["date"])
        # pd.to_datetime does not raise for missing values (None / NaN give
        # None / NaT) and returns an index for list-like input, so require an
        # actual scalar Timestamp before accepting the event.
        if not isinstance(dt, pd.Timestamp):
            raise ValueError("date did not parse to a single Timestamp")
        events.append({"name": ev["name"], "date": dt})
    except Exception:
        print(f"Skipping invalid date for event {ev}")
events


[{'name': 'Election 2012', 'date': Timestamp('2012-11-06 00:00:00')},
 {'name': 'Sandy Hook Shooting', 'date': Timestamp('2012-12-14 00:00:00')},
 {'name': 'Election 2016', 'date': Timestamp('2016-11-08 00:00:00')},
 {'name': 'Breonna Taylor Murder', 'date': Timestamp('2020-03-13 00:00:00')},
 {'name': 'COVID-19 Pandemic', 'date': Timestamp('2020-03-01 00:00:00')},
 {'name': 'San Bernardino Shooting', 'date': Timestamp('2015-12-02 00:00:00')},
 {'name': 'Parkland Shooting', 'date': Timestamp('2018-02-14 00:00:00')},
 {'name': 'Election 2020', 'date': Timestamp('2020-11-08 00:00:00')},
 {'name': 'George Floyd Murder', 'date': Timestamp('2020-05-25 00:00:00')}]

In [9]:
# Cell 3: Initialize OpenAI Client
# The API key is read from the optional "openai" section of the config; when the
# section or the key is absent, `api_key` is None and is passed on unchanged.
# NOTE(review): the key lives in the YAML config — make sure that file is kept
# out of version control.
api_key = cfg.get("openai", {}).get("api_key")
client = init_openai(api_key)
client  # should show an initialized OpenAI wrapper

<openai.OpenAI at 0x7f9512058fa0>

In [10]:
# Cell 4: Load & Preprocess Data
# Both corpus locations are taken from cfg["data"].
dl = DataLoader(
    cfg["data"]["conservative_path"],
    cfg["data"]["liberal_path"],
)
pp = Preprocessor()

# Load the two raw frames and run each through the same preprocessor
# (conservative first, then liberal).
cons_df, lib_df = (pp.apply(raw_frame) for raw_frame in dl.load())

# Preview the first rows of both processed frames.
cons_df.head(), lib_df.head()

(            timestamp                                               text  \
 0 2011-05-03 14:01:23  We’re still seeing a flood of calls from both ...   
 1 2011-05-04 16:08:36  In a story about the resurrection of the harsh...   
 2 2011-05-05 06:52:52  Political Byline This blog is no longer active...   
 3 2011-05-05 11:04:22  Our results underscore the decisive relevance ...   
 4 2011-05-05 15:45:56  by Martin and Marcia The most recent double is...   
 
       subreddit                                         clean_text  
 0  conservative  were still seeing a flood of calls from both s...  
 1  conservative  in a story about the resurrection of the harsh...  
 2  conservative  political byline this blog is no longer active...  
 3  conservative  our results underscore the decisive relevance ...  
 4  conservative  by martin and marcia the most recent double is...  ,
             timestamp                                               text  \
 0 2011-05-02 21:32:41  Advertisement 

In [14]:
# Cell 5: Generate Embeddings (with caching)
# Model name, batch size and device come from cfg["embedding"]; the cache
# directory comes from cfg["output"].
emb_cfg = cfg["embedding"]
embedder = EmbeddingModel(
    model_name=emb_cfg["model_name"],
    batch_size=emb_cfg["batch_size"],
    device=emb_cfg["device"],
    cache_dir=cfg["output"]["cache_dir"],
)

# Embed each community's cleaned text; the second argument is the group tag
# handed to the embedder (conservative first, then liberal).
cons_emb, lib_emb = (
    embedder.embed(frame["clean_text"].tolist(), group)
    for frame, group in ((cons_df, "conservative"), (lib_df, "liberal"))
)

# Report the shape of both embedding arrays.
print("cons_emb:", cons_emb.shape, "lib_emb:", lib_emb.shape)


cons_emb: (22128, 768) lib_emb: (22153, 768)


In [11]:
# Cell 6: Fit BERTopic-based Topic Model & Extract Topic IDs
# Only `hdbscan_min_cluster_size` is read from the cluster section in this cell.
tm_cfg = cfg["topic_model"]["cluster"]

topic_model = EmbeddingClusterTopicModel(
    embedding_model_name=cfg["embedding"]["model_name"],

    # drop very rare / very common filler tokens
    vectorizer_params={
        "ngram_range": (1, 2),
        "stop_words": "english",
        "min_df": cfg["analysis"].get("min_df", 10),
        "max_df": cfg["analysis"].get("max_df", 0.90),
    },

    nr_topics=cfg["topic_model"].get("nr_topics", "auto"),
    top_n_words=cfg["analysis"].get("top_n_words", 10),

    # treat this as the minimum cluster size in BERTopic
    min_topic_size=tm_cfg["hdbscan_min_cluster_size"],

    cache_dir=cfg["output"]["cache_dir"],
)

# The model is fitted on the lists of cleaned texts, not on precomputed embeddings.
# NOTE(review): the same `topic_model` instance is fitted twice, once per
# community — confirm that `fit` keeps the two runs independent (the second
# argument looks like a per-group tag).
cons_topics = topic_model.fit(cons_df["clean_text"].tolist(), "conservative")
lib_topics = topic_model.fit(lib_df["clean_text"].tolist(), "liberal")

# Sanity check: how many unique topic IDs did we get?
# (`np` comes from the notebook's import cell; it is not re-imported here.)
print("Conservative topics:", np.unique(cons_topics))
print("Liberal topics:    ", np.unique(lib_topics))


2025-05-19 18:21:14,419 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 692/692 [02:16<00:00,  5.09it/s]
2025-05-19 18:23:34,954 - BERTopic - Embedding - Completed ✓
2025-05-19 18:23:34,955 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-19 18:23:39,177 - BERTopic - Dimensionality - Completed ✓
2025-05-19 18:23:39,178 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-19 18:23:41,927 - BERTopic - Cluster - Completed ✓
2025-05-19 18:23:41,928 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-19 18:24:10,084 - BERTopic - Representation - Completed ✓
2025-05-19 18:24:10,086 - BERTopic - Topic reduction - Reducing number of topics
2025-05-19 18:24:10,103 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-19 

Conservative topics: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
Liberal topics:     [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]


In [15]:
# Cell 6 (alternative variant): Fit Topic Model on precomputed embeddings & Extract Topic IDs
# NOTE(review): this cell rebinds `topic_model`, `cons_topics` and `lib_topics`,
# replacing whatever the BERTopic-based Cell 6 above produced, and it constructs
# EmbeddingClusterTopicModel with different keyword arguments than that cell.
# Keep only one of the two variants so a top-to-bottom run is unambiguous.
# It needs `cons_emb` / `lib_emb` from the embedding cell (Cell 5).
tm_cfg = cfg["topic_model"]["cluster"]
topic_model = EmbeddingClusterTopicModel(
    umap_neighbors=tm_cfg["umap_neighbors"],
    umap_min_dist=tm_cfg["umap_min_dist"],
    hdbscan_min_cluster_size=tm_cfg["hdbscan_min_cluster_size"],
    cache_dir=cfg["output"]["cache_dir"],
)

# Fit once per community on the embedding arrays; the second argument is the group tag.
cons_topics = topic_model.fit(cons_emb, "conservative")
lib_topics  = topic_model.fit(lib_emb,  "liberal")

# Basic sanity check: the distinct topic ids assigned in each community.
np.unique(cons_topics), np.unique(lib_topics)




(array([-1,  0,  1]),
 array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
        50]))

In [12]:
# Cell 7: Extract Top Terms per Topic
def extract_top_terms(df, topics, top_n, stop_words=None):
    """Return the most frequent whitespace-separated terms for every topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "clean_text" column of strings, one row per document.
    topics : array-like
        Topic id of each row of ``df``, in row order (list, tuple, numpy array
        or Series; matched positionally, so it must have ``len(df)`` entries).
    top_n : int
        Maximum number of terms to keep per topic.
    stop_words : iterable of str, optional
        Terms to ignore while counting. ``None`` (the default) keeps every
        term, which is the historical behavior; pass a stop-word collection to
        keep filler words such as "the"/"to"/"of" out of the result.

    Returns
    -------
    dict
        Maps each distinct topic id (ascending order) to a list of up to
        ``top_n`` terms, most frequent first; ties keep first-seen order.
    """
    # Coerce once so `topic_ids == tid` is always an element-wise mask; with a
    # plain list the comparison would collapse to a single False.
    topic_ids = np.asarray(topics)
    ignored = frozenset(stop_words) if stop_words is not None else frozenset()

    term_counts = {}
    for tid in np.unique(topic_ids).tolist():
        texts = df.loc[topic_ids == tid, "clean_text"]
        # Count document by document instead of joining everything into one
        # huge string first; the resulting counts and ordering are the same.
        words = Counter(
            word
            for text in texts
            for word in text.split()
            if word not in ignored
        )
        term_counts[tid] = [w for w, _ in words.most_common(top_n)]
    return term_counts

# Number of terms kept per topic: optional "max_terms" in the "openai" config
# section, defaulting to 10.
max_terms = cfg.get("openai", {}).get("max_terms", 10)

# Top terms per topic for each community (conservative first, then liberal).
cons_terms, lib_terms = (
    extract_top_terms(frame, assignments, max_terms)
    for frame, assignments in ((cons_df, cons_topics), (lib_df, lib_topics))
)

cons_terms, lib_terms


({-1: ['the',
   'to',
   'of',
   'and',
   'a',
   'in',
   'that',
   'is',
   'for',
   'on',
   'it',
   'as',
   'with',
   'was',
   'he',
   'are',
   'be',
   'this',
   'not',
   'i',
   'by',
   'have',
   'you',
   'his',
   'at',
   'they',
   'but',
   'from',
   'who',
   'has',
   'an',
   'we',
   'or',
   'said',
   'their',
   'about',
   'more',
   'all',
   'will',
   'trump'],
  0: ['the',
   'to',
   'of',
   'and',
   'a',
   'in',
   'that',
   'is',
   'for',
   'on',
   'it',
   'with',
   'was',
   'as',
   'he',
   'by',
   'are',
   'not',
   'this',
   'be',
   'have',
   'from',
   'at',
   'his',
   'i',
   'they',
   'but',
   'has',
   'you',
   'an',
   'who',
   'said',
   'or',
   'we',
   'their',
   'about',
   'trump',
   'all',
   'will',
   'were'],
  1: ['the',
   'to',
   'of',
   'and',
   'a',
   'in',
   'that',
   'is',
   'for',
   'on',
   'it',
   'are',
   'as',
   'by',
   'have',
   'with',
   'be',
   'this',
   'not',
   'they',


In [13]:
# Cell 8: Label Topics via OpenAI
# For each community, request one label per topic id from the configured
# OpenAI model, based on that topic's top terms (conservative first, then
# liberal). Every topic id present in the term maps is labelled.
cons_labels, lib_labels = (
    {
        tid: generate_topic_label(client, terms, model=cfg["openai"]["model"])
        for tid, terms in term_map.items()
    }
    for term_map in (cons_terms, lib_terms)
)

cons_labels, lib_labels


({-1: 'Donald Trump',
  0: 'General News',
  1: 'General Discussion',
  2: 'General Discourse',
  3: 'Gun Debate',
  4: 'General English Text',
  5: 'Abortion Debate',
  6: 'NFL National Anthem',
  7: 'China-U.S. Relations',
  8: 'General English Language',
  9: 'Cuban Politics',
  10: 'U.S. Constitution',
  11: 'Sexual Assault Awareness',
  12: 'U.S. Trade Policy',
  13: 'Government Welfare Programs',
  14: 'Browser Security Check',
  15: 'Email Moderation',
  16: 'Online Gambling',
  17: 'Labor Unions'},
 {-1: 'Political Commentary',
  0: 'Trump Presidency',
  1: 'Democratic Campaign',
  2: 'U.S. Elections',
  3: 'General English Text',
  4: 'Climate Change',
  5: 'General Discourse',
  6: 'Gun Control Debate',
  7: 'Health Insurance',
  8: 'Trump and Coronavirus',
  9: 'Immigration Policy',
  10: 'Political News',
  11: 'Abortion Debate',
  12: 'Political Commentary',
  13: 'Tax Policy Debate',
  14: 'Police and Race',
  15: 'Labor and Wages',
  16: 'Financial Institutions',
  17: '

In [14]:
# Cell 9: Compute Metrics (entropy, counts, spread, similarities)
# Requires the embeddings (`cons_emb`, `lib_emb`) and the topic assignments
# (`cons_topics`, `lib_topics`) from the earlier cells.
mc = MetricsCalculator(cfg["output"]["metrics_dir"])
interval = cfg["analysis"]["time_interval"]  # time-bucket frequency shared by every metric below

# Per-period topic entropy and topic count, one frame per community.
cons_metrics, lib_metrics = (
    mc.topic_entropy_and_count(topic_ids, frame["timestamp"], freq=interval)
    for topic_ids, frame in ((cons_topics, cons_df), (lib_topics, lib_df))
)

# Semantic spread over both communities stacked together.
# NOTE(review): cons_topics and lib_topics come from separate fits, so the same
# integer id may denote unrelated topics once concatenated — confirm that
# semantic_spread is meant to receive them this way.
spread_df = mc.semantic_spread(
    np.vstack([cons_emb, lib_emb]),
    np.concatenate([cons_topics, lib_topics]),
    pd.concat([cons_df["timestamp"], lib_df["timestamp"]]),
    freq=interval,
)

# Within-community similarity over time.
intra_cons, intra_lib = (
    mc.intra_group_similarity(embeddings, frame["timestamp"], freq=interval)
    for embeddings, frame in ((cons_emb, cons_df), (lib_emb, lib_df))
)

# Between-community similarity over time.
cross_sim = mc.cross_group_similarity(
    cons_emb,
    lib_emb,
    cons_df["timestamp"],
    lib_df["timestamp"],
    freq=interval,
)

# Quick look
cons_metrics.head(), lib_metrics.head(), spread_df.head()


NameError: name 'cons_emb' is not defined

In [None]:
# Cell 10: Statistical Tests & P-values
# Q1: trend in entropy
# Regress each community's per-period entropy on the calendar year of the
# period and report the regression p-value.
# NOTE(review): only the year is used as the regressor, so periods within the
# same year share one x value when the interval is finer than yearly.
years_con = cons_metrics["period"].dt.year.astype(int)
p_ent_con = stats.linregress(years_con, cons_metrics["entropy"]).pvalue
years_lib = lib_metrics["period"].dt.year.astype(int)
p_ent_lib = stats.linregress(years_lib, lib_metrics["entropy"]).pvalue
print(f"Q1: Entropy trend p-values => conservative={p_ent_con:.3g}, liberal={p_ent_lib:.3g}")

# Q2: pre/post event change in cross similarity
# Compare cross-community similarity in the `prepost` months before each event
# with the `prepost` months after it (the event instant itself is excluded).
prepost = cfg.get("stats", {}).get("prepost_window", 3)
window = pd.DateOffset(months=prepost)
for ev in events:
    in_before = (cross_sim["period"] >= ev["date"] - window) & (cross_sim["period"] < ev["date"])
    in_after = (cross_sim["period"] > ev["date"]) & (cross_sim["period"] <= ev["date"] + window)
    # Select with .loc instead of chained indexing.
    before = cross_sim.loc[in_before, "cross_similarity"]
    after = cross_sim.loc[in_after, "cross_similarity"]

    # Welch's t-test needs a variance estimate on both sides, i.e. at least two
    # observations each; with fewer it only yields NaN. Say so explicitly
    # instead of printing NaN or dropping the event without a trace.
    if len(before) < 2 or len(after) < 2:
        print(
            f"Q2 ({ev['name']}): skipped, need at least 2 periods on each side "
            f"(before={len(before)}, after={len(after)})"
        )
        continue

    p = stats.ttest_ind(before, after, equal_var=False).pvalue
    print(f"Q2 ({ev['name']}): cross-sim p-value = {p:.3g}")

# Q3: echo chamber difference
# Paired t-test on the periods present in both intra-similarity frames.
merged = pd.merge(
    intra_cons.rename(columns={"intra_similarity": "con"}),
    intra_lib.rename(columns={"intra_similarity": "lib"}),
    on="period",
)
p_echo = stats.ttest_rel(merged["con"], merged["lib"]).pvalue
print(f"Q3: Intra-group similarity difference p-value = {p_echo:.3g}")


In [15]:
# Cell 11: Setup Visualizer & Basic Diversity Plots
# The Visualizer is constructed with the configured plots directory; the
# `filename` given to each call is presumably resolved against it — TODO confirm.
# Needs `cons_metrics` / `lib_metrics` from Cell 9.
viz = Visualizer(cfg["output"]["plots_dir"])

# Conservative & Liberal entropy & topic count
# Entropy per period, one figure per community.
viz.plot_time_series(cons_metrics, "period", "entropy",
                     title="Conservative Entropy", filename="cons_entropy.png")
viz.plot_time_series(lib_metrics,  "period", "entropy",
                     title="Liberal Entropy",    filename="lib_entropy.png")

# Topic count per period, one figure per community.
viz.plot_time_series(cons_metrics, "period", "topic_count",
                     title="Conservative Topic Count", filename="cons_topic_count.png")
viz.plot_time_series(lib_metrics,  "period", "topic_count",
                     title="Liberal Topic Count",    filename="lib_topic_count.png")


In [None]:
# Cell 12: Semantic Spread Ribbon & Convergence
# Needs `spread_df` and `cross_sim` from Cell 9, `viz` from Cell 11 and
# `events` from Cell 2.
# build the ribbon DataFrame
# One row per period holding the median and the 25% / 75% quantiles of the
# "spread" values that fall into that period.
ribbon = (
    spread_df.groupby("period")["spread"]
    .agg(median="median",
         q1=lambda x: x.quantile(0.25),
         q3=lambda x: x.quantile(0.75))
    .reset_index()
)
# Ribbon plot: median line with the q1..q3 band; the events are passed along,
# presumably to be marked on the time axis — confirm in Visualizer.
viz.plot_ribbon(
    ribbon, "period", "median", "q1", "q3",
    title="Semantic Spread", filename="semantic_spread.png",
    events=events
)

# Cross-community similarity over time, with the same events passed along.
viz.plot_time_series(
    cross_sim, "period", "cross_similarity",
    title="Cross-Community Similarity",
    events=events, filename="cross_similarity.png"
)


In [16]:
# Cell 13: Echo-Chamber Plots
# Plots the intra-group similarity series of each community over time.
# Needs `intra_cons` / `intra_lib` from Cell 9, so that cell must have run
# successfully first.
viz.plot_time_series(
    intra_cons, "period", "intra_similarity",
    title="Conservative Echo Chamber", filename="intra_cons.png"
)
viz.plot_time_series(
    intra_lib, "period", "intra_similarity",
    title="Liberal Echo Chamber",    filename="intra_lib.png"
)


NameError: name 'intra_cons' is not defined

In [17]:
# Cell 14: Topic-Prevalence Over Time
def freq_df(df, topics, freq=None):
    """Count documents per (period, topic).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime "timestamp" column, one row per document.
    topics : array-like
        Topic id of each row of ``df``.
    freq : str, optional
        Pandas period frequency used to bucket the timestamps (e.g. "M", "Q").
        When omitted, ``cfg["analysis"]["time_interval"]`` is used, which is
        what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns "period" (timestamp at the start of the bucket), "topic" and
        "count", sorted by period and topic; only combinations that actually
        occur are present.
    """
    if freq is None:
        # Fall back to the notebook-wide setting only when the caller did not choose.
        freq = cfg["analysis"]["time_interval"]

    return (
        pd.DataFrame({"timestamp": df["timestamp"], "topic": topics})
          .assign(period=lambda d: d["timestamp"]
                               .dt.to_period(freq)
                               .dt.to_timestamp())
          .groupby(["period", "topic"])
          .size()
          .reset_index(name="count")
    )

# Per-period topic counts for each community.
cons_freq = freq_df(cons_df, cons_topics)
lib_freq = freq_df(lib_df, lib_topics)


def _display_labels(topic_ids, labels):
    """Return one display label per entry of `topic_ids`.

    `labels` maps topic id -> label text. Two safeguards keep distinct topics
    distinct once the plots group by label:

    * a label text shared by several topic ids gets the id appended
      ("<label> (<id>)"), so those topics are not merged into one series;
    * a topic id with no entry in `labels` becomes "Topic <id>" instead of NaN,
      so its counts are not silently lost.
    """
    label_counts = Counter(labels.values())

    def to_label(tid):
        label = labels.get(tid)
        if label is None:
            return f"Topic {tid}"
        if label_counts[label] > 1:
            return f"{label} ({tid})"
        return label

    return topic_ids.map(to_label)


# Map numeric IDs → labels
cons_freq["topic_label"] = _display_labels(cons_freq["topic"], cons_labels)
lib_freq["topic_label"] = _display_labels(lib_freq["topic"], lib_labels)

# Number of topics shown in the prevalence plots (optional "top_n" in the
# "analysis" config section, default 5).
top_n = cfg.get("analysis", {}).get("top_n", 5)


In [18]:
# Cell 15: Plot Top-N Topic Trends
# One prevalence figure per community, built from the per-period topic counts
# (`cons_freq` / `lib_freq`) and grouped by the human-readable "topic_label"
# column. `top_n` limits how many topics are drawn; `normalize=True` presumably
# plots shares rather than raw counts — confirm in Visualizer.
viz.plot_topic_prevalence(
    cons_freq, period_col="period", topic_col="topic_label",
    count_col="count", top_n=top_n, normalize=True,
    title=f"Top {top_n} Conservative Topics Over Time",
    filename=f"cons_top{top_n}_topics.png", events=events
)
viz.plot_topic_prevalence(
    lib_freq,  period_col="period", topic_col="topic_label",
    count_col="count", top_n=top_n, normalize=True,
    title=f"Top {top_n} Liberal Topics Over Time",
    filename=f"lib_top{top_n}_topics.png", events=events
)


In [19]:
# Cell 16: Combined Topic Trends
# Single figure comparing the top topics of both communities, using the same
# label / count columns as the per-community prevalence plots.
viz.plot_combined_topic_trends(
    cons_freq, lib_freq,
    period_col="period", topic_col="topic_label", count_col="count",
    top_n=top_n, normalize=True,
    title=f"Top {top_n} Topics: Conservative vs Liberal",
    # f-string so the file name carries the actual top_n value rather than the
    # literal text "{top_n}" (consistent with the per-community plot names).
    filename=f"combined_top{top_n}_topics.png",
    events=events
)
