# Package Tutorial: reddit_ideology

## Step 1: Install Package

In [1]:
# Editable install of the package that lives one directory above this notebook.
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install -e ..

# Turn on autoreload in mode 2 so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Step 2: Load Package & config.yaml

In [8]:
import os
import sys

import numpy as np

# Resolve the directory above the current working directory and its src/
# folder, then make sure src/ is the first entry searched on import.
PROJECT_ROOT = os.path.abspath("..")
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
print("Adding to sys.path:", SRC_PATH)
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

# Echo the head of the search path so the reader can confirm the insertion.
print("sys.path now contains:")
for path_entry in sys.path[:5]:
    print("  ", path_entry)

# Public pipeline components used by the remaining steps of this tutorial.
from reddit_ideology import (
    load_config,
    DataLoader,
    Preprocessor,
    EmbeddingModel,
    TopicModel,
    MetricsCalculator,
    Visualizer,
)

# Load the configuration file; the bare name on the last line renders it
# as the cell output.
config = load_config("../config.yaml")
config

Adding to sys.path: /Users/isaacharlem/Desktop/css_package/src
sys.path now contains:
   /Users/isaacharlem/Desktop/css_package/src
   /opt/anaconda3/envs/css_model/lib/python310.zip
   /opt/anaconda3/envs/css_model/lib/python3.10
   /opt/anaconda3/envs/css_model/lib/python3.10/lib-dynload
   


{'data': {'conservative_path': '../data/cons_data.json',
  'liberal_path': '../data/lib_data.json'},
 'embedding': {'model_name': 'sentence-transformers/all-mpnet-base-v2',
  'batch_size': 128,
  'device': 'mps'},
 'topic_model': {'method': 'cluster',
  'cluster': {'umap_neighbors': 15,
   'umap_min_dist': 0.0,
   'hdbscan_min_cluster_size': 15}},
 'analysis': {'time_interval': 'Y'},
 'events': [{'name': 'Election 2012', 'date': '2012-11-06'},
  {'name': 'ACA Ruling', 'date': '2012-06-28'},
  {'name': 'Election 2016', 'date': '2016-11-08'},
  {'name': 'Obergefell v. Hodges', 'date': '2015-06-26'}],
 'output': {'cache_dir': '../results/cache',
  'plots_dir': '../results/plots',
  'metrics_dir': '../results/metrics'}}

## Step 3: Load in Data

In [10]:
# Build the loader from the two corpus paths in the `data` section of the
# config (conservative path first, liberal path second).
data_paths = config['data']
dl = DataLoader(data_paths['conservative_path'], data_paths['liberal_path'])
cons_df, lib_df = dl.load()

# Preview the first rows of the conservative frame.
cons_df.head()

Unnamed: 0,timestamp,text,subreddit
0,2011-05-03 14:01:23,We’re still seeing a flood of calls from both ...,conservative
1,2011-05-04 16:08:36,In a story about the resurrection of the harsh...,conservative
2,2011-05-05 06:52:52,Political Byline This blog is no longer active...,conservative
3,2011-05-05 11:04:22,Our results underscore the decisive relevance ...,conservative
4,2011-05-05 15:45:56,by Martin and Marcia The most recent double is...,conservative


## Step 4: Preprocess Data

In [12]:
# Run one Preprocessor over both community frames, conservative first.
pp = Preprocessor()
cons_df, lib_df = pp.apply(cons_df), pp.apply(lib_df)

# Show the raw `text` column next to `clean_text` for a quick sanity check.
preview_columns = ['text', 'clean_text']
cons_df[preview_columns].head()

Unnamed: 0,text,clean_text
0,We’re still seeing a flood of calls from both ...,were still seeing a flood of calls from both s...
1,In a story about the resurrection of the harsh...,in a story about the resurrection of the harsh...
2,Political Byline This blog is no longer active...,political byline this blog is no longer active...
3,Our results underscore the decisive relevance ...,our results underscore the decisive relevance ...
4,by Martin and Marcia The most recent double is...,by martin and marcia the most recent double is...


## Step 5: Generate Embeddings

In [None]:
# Configure the embedding model from the `embedding` section of the config;
# the cache directory comes from the `output` section.
emb_cfg = config['embedding']
embedder = EmbeddingModel(
    cache_dir=config['output']['cache_dir'],
    **{key: emb_cfg[key] for key in ('model_name', 'batch_size', 'device')},
)

# Embed the cleaned text of each community, conservative first.
cons_texts = cons_df['clean_text'].tolist()
cons_emb = embedder.embed(cons_texts, name="conservative")
lib_texts = lib_df['clean_text'].tolist()
lib_emb = embedder.embed(lib_texts, name="liberal")

print("Shapes:", cons_emb.shape, lib_emb.shape)

Embedding conservative:  40%|████      | 70/173 [10:22<15:13,  8.87s/it]

## Step 6: Topic Modeling via Clustering

In [None]:
# Pull the clustering hyper-parameters out of the config and pass them
# through to TopicModel as keyword arguments.
tm_cfg = config['topic_model']['cluster']
cluster_params = {
    key: tm_cfg[key]
    for key in ('umap_neighbors', 'umap_min_dist', 'hdbscan_min_cluster_size')
}
topic_model = TopicModel(cache_dir=config['output']['cache_dir'], **cluster_params)

# Fit on each community's embeddings in turn, using the same model object.
cons_topics = topic_model.fit(cons_emb, name="conservative")
lib_topics = topic_model.fit(lib_emb, name="liberal")

# Report the distinct topic labels found for each community.
for heading, labels in (
    ("Conservative topics:", cons_topics),
    ("Liberal topics:", lib_topics),
):
    print(heading, np.unique(labels))

## Step 7: Compute Metrics

In [None]:
# pandas is required for pd.concat below. No earlier cell imports it, so it is
# imported here to keep this cell runnable on a fresh kernel.
import pandas as pd

mc = MetricsCalculator(config['output']['metrics_dir'])

# Every metric below is bucketed with the same time interval from the config.
freq = config['analysis']['time_interval']

# Per-community topic entropy and count over time.
cons_metrics = mc.topic_entropy_and_count(
    topics=cons_topics,
    timestamps=cons_df['timestamp'],
    freq=freq
)
lib_metrics = mc.topic_entropy_and_count(
    topics=lib_topics,
    timestamps=lib_df['timestamp'],
    freq=freq
)

# Semantic spread over the pooled corpus. Embeddings, topics and timestamps
# are all stacked conservative-first so their rows stay aligned.
spread_df = mc.semantic_spread(
    embeddings=np.vstack([cons_emb, lib_emb]),
    topics=np.concatenate([cons_topics, lib_topics]),
    timestamps=pd.concat([cons_df['timestamp'], lib_df['timestamp']]),
    freq=freq
)

# Similarity within each community.
intra_cons = mc.intra_group_similarity(
    embeddings=cons_emb,
    timestamps=cons_df['timestamp'],
    freq=freq
)
intra_lib = mc.intra_group_similarity(
    embeddings=lib_emb,
    timestamps=lib_df['timestamp'],
    freq=freq
)

# Similarity between the two communities.
cross_sim = mc.cross_group_similarity(
    emb1=cons_emb, emb2=lib_emb,
    ts1=cons_df['timestamp'], ts2=lib_df['timestamp'],
    freq=freq
)

# Preview the conservative metrics table.
cons_metrics.head()

## Step 8: Visualize Results

In [None]:
from IPython.display import Image, display

plots_dir = config['output']['plots_dir']
viz = Visualizer(plots_dir)

# One row per figure: (data frame, y column, title, output filename, extra kwargs).
# Only the cross-community plot receives the configured events.
plot_specs = [
    # Q1: Diversity
    (cons_metrics, 'entropy', 'Conservative Topic Entropy',
     'test_cons_entropy.png', {}),
    (lib_metrics, 'entropy', 'Liberal Topic Entropy',
     'test_lib_entropy.png', {}),
    # Q2: Convergence
    (cross_sim, 'cross_similarity', 'Cross-community Similarity',
     'test_cross_similarity.png', {'events': config['events']}),
    # Q3: Echo chambers
    (intra_cons, 'intra_similarity', 'Conservative Intra-group Sim',
     'test_intra_cons.png', {}),
    (intra_lib, 'intra_similarity', 'Liberal Intra-group Sim',
     'test_intra_lib.png', {}),
]

# Render every figure first, all against the shared 'period' x-axis column.
for frame, y_column, plot_title, plot_file, extra_kwargs in plot_specs:
    viz.plot_time_series(frame, 'period', y_column,
                         title=plot_title,
                         filename=plot_file,
                         **extra_kwargs)

# Display: show the image files in the same order they were plotted.
for spec in plot_specs:
    display(Image(filename=f"{plots_dir}/{spec[3]}"))