# Package Tutorial: reddit_ideology

## Step 1: Install Package

In [None]:
# One-time setup: uncomment the next line to install the package in editable
# mode from the parent directory of this notebook.
#%pip install -e ..

# Load IPython's autoreload extension and enable mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Step 2: Load Package & config.yaml

In [None]:
import sys, os
import numpy as np
import pandas as pd

# Resolve the repository root (parent of the current working directory) and
# its src/ folder, then make src/ importable for the cells below.
PROJECT_ROOT = os.path.abspath(os.pardir)
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
print("Adding to sys.path:", SRC_PATH)
if SRC_PATH not in sys.path:
    # Front of the list so this checkout takes priority over other entries.
    sys.path.insert(0, SRC_PATH)

# Echo the first few entries as a quick sanity check of the import order.
print("sys.path now contains:")
for entry in sys.path[:5]:
    print("  ", entry)

from reddit_ideology import (
    load_config,
    DataLoader,
    Preprocessor,
    EmbeddingModel,
    TopicModel,
    MetricsCalculator,
    Visualizer
)

config = load_config("config.yaml")

# Display the parsed settings for inspection, but mask any `api_key` entry:
# cell outputs are saved with the notebook, so echoing the raw config would
# write the OpenAI credential (config['openai']['api_key']) into the file.
# Assumes load_config returns a plain dict of sections - TODO confirm.
{
    section: (
        {key: ("***" if key == "api_key" else value) for key, value in settings.items()}
        if isinstance(settings, dict)
        else settings
    )
    for section, settings in config.items()
}

## Step 3: Load the Data

In [None]:
# Both corpus locations live under the `data` section of config.yaml.
data_cfg = config['data']
dl = DataLoader(data_cfg['conservative_path'], data_cfg['liberal_path'])
cons_df, lib_df = dl.load()

# Preview the first rows of the conservative frame.
cons_df.head()

## Step 4: Preprocess Data

In [None]:
# Run the same preprocessor over both corpora (conservative first, then liberal).
pp = Preprocessor()
cons_df, lib_df = [pp.apply(frame) for frame in (cons_df, lib_df)]

# Compare raw and cleaned text side by side.
cons_df[['text', 'clean_text']].head()

## Step 5: Generate Embeddings

In [None]:
emb_cfg = config['embedding']
# Model options come from the `embedding` section; the cache directory is
# taken from the `output` section.
embedder = EmbeddingModel(
    **{option: emb_cfg[option] for option in ('model_name', 'batch_size', 'device')},
    cache_dir=config['output']['cache_dir'],
)

# Embed the cleaned text of each community; `name` identifies each corpus.
cons_emb, lib_emb = (
    embedder.embed(frame['clean_text'].tolist(), name=group)
    for group, frame in (("conservative", cons_df), ("liberal", lib_df))
)

print("Shapes:", cons_emb.shape, lib_emb.shape)

## Step 6: Topic Modeling via Clustering

In [None]:
tm_cfg = config['topic_model']['cluster']
# Clustering hyper-parameters are forwarded under the same names they have in
# config.yaml.
topic_model = TopicModel(
    **{
        option: tm_cfg[option]
        for option in ('umap_neighbors', 'umap_min_dist', 'hdbscan_min_cluster_size')
    },
    cache_dir=config['output']['cache_dir'],
)

# Fit each community separately (conservative first, then liberal).
cons_topics, lib_topics = (
    topic_model.fit(embeddings, name=group)
    for group, embeddings in (("conservative", cons_emb), ("liberal", lib_emb))
)

# List the distinct topic ids found for each community.
for heading, assigned in (("Conservative topics:", cons_topics), ("Liberal topics:", lib_topics)):
    print(heading, np.unique(assigned))

## Step 7: Label Topics & Compute Metrics

In [18]:
from collections import Counter
from reddit_ideology.openai_utils import init_openai, generate_topic_label
from reddit_ideology.topic_model import EmbeddingClusterTopicModel
from reddit_ideology.metrics import MetricsCalculator

# Use the key from config.yaml when it is set; otherwise fall back to the
# OPENAI_API_KEY environment variable so the secret does not have to be stored
# in a file that may end up in version control.
openai_api_key = (config.get('openai') or {}).get('api_key') or os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    # Same exception type as the original missing-key lookup, with a hint.
    raise KeyError(
        "api_key: set openai.api_key in config.yaml or the OPENAI_API_KEY environment variable"
    )
client = init_openai(openai_api_key)

In [None]:
# === 2) Extract top terms & label topics ===
def extract_top_terms(df, topics, top_n):
    """Return the most frequent whitespace-separated tokens for each topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'clean_text' column; row i belongs to topic ``topics[i]``.
    topics : array-like
        One topic id per row of ``df``. Accepts a NumPy array, list or Series;
        it is converted to an array so rows are matched by position rather
        than by index label.
    top_n : int
        Maximum number of tokens to keep per topic.

    Returns
    -------
    dict
        Maps each distinct topic id (ascending) to a list of at most
        ``top_n`` tokens, most frequent first.
    """
    topic_ids = np.asarray(topics)
    counts = {}
    for tid in np.unique(topic_ids):
        # Boolean NumPy mask: selects rows positionally, whatever df's index is.
        texts = df.loc[topic_ids == tid, 'clean_text']
        # Missing texts would make str.join raise, so drop them first.
        words = Counter(" ".join(texts.dropna().astype(str)).split())
        counts[tid] = [word for word, _ in words.most_common(top_n)]
    return counts

# Number of top terms sent to the labelling model (defaults to 10).
max_terms = config.get('openai', {}).get('max_terms', 10)

# conservative: one generated label per topic id
cons_terms = extract_top_terms(cons_df, cons_topics, max_terms)
cons_labels = {}
for tid, terms in cons_terms.items():
    cons_labels[tid] = generate_topic_label(client, terms, model=config['openai']['model'])

# liberal: same procedure on the liberal corpus
lib_terms = extract_top_terms(lib_df, lib_topics, max_terms)
lib_labels = {}
for tid, terms in lib_terms.items():
    lib_labels[tid] = generate_topic_label(client, terms, model=config['openai']['model'])

# === 3) Compute all metrics ===
mc = MetricsCalculator(config['output']['metrics_dir'])

# Topic entropy and topic count per time bucket, computed for the conservative
# corpus first and then the liberal one.
cons_metrics, lib_metrics = (
    mc.topic_entropy_and_count(
        topics=assigned,
        timestamps=frame['timestamp'],
        freq=config['analysis']['time_interval'],
    )
    for assigned, frame in ((cons_topics, cons_df), (lib_topics, lib_df))
)

# The two communities were clustered by separate topic_model.fit calls, so the
# same integer id denotes unrelated clusters in each. Shift the liberal ids
# past the largest conservative id before pooling, so a conservative and a
# liberal cluster can never share an id.
# Negative ids are left untouched: presumably -1 marks HDBSCAN noise points
# - TODO confirm against TopicModel.fit.
cons_ids = np.asarray(cons_topics)
lib_ids = np.asarray(lib_topics)
lib_offset = max(int(cons_ids.max()) + 1, 0) if cons_ids.size else 0
pooled_topics = np.concatenate(
    [cons_ids, np.where(lib_ids >= 0, lib_ids + lib_offset, lib_ids)]
)

spread_df = mc.semantic_spread(
    embeddings=np.vstack([cons_emb, lib_emb]),
    topics=pooled_topics,
    timestamps=pd.concat([cons_df['timestamp'], lib_df['timestamp']]),
    freq=config['analysis']['time_interval']
)

# Within-community similarity over time, conservative first and then liberal.
intra_cons, intra_lib = (
    mc.intra_group_similarity(
        embeddings=group_emb,
        timestamps=frame['timestamp'],
        freq=config['analysis']['time_interval'],
    )
    for group_emb, frame in ((cons_emb, cons_df), (lib_emb, lib_df))
)

# Between-community similarity over the same time buckets.
cross_sim = mc.cross_group_similarity(
    emb1=cons_emb,
    emb2=lib_emb,
    ts1=cons_df['timestamp'],
    ts2=lib_df['timestamp'],
    freq=config['analysis']['time_interval'],
)

# === 4) Quick check ===
for heading, labels in (
    ("Conservative topic labels:", cons_labels),
    ("Liberal topic labels:     ", lib_labels),
):
    print(heading, labels)
cons_metrics.head()

## Step 8: Visualize Results

In [None]:
import pandas as pd
import scipy.stats as stats
from IPython.display import Image, display
from reddit_ideology.config import load_config
from reddit_ideology.visualize import Visualizer

# Reload the configuration and pull out the plotting/statistics settings.
cfg = load_config('config.yaml')
outdir = cfg['output']['plots_dir']
prepost = cfg.get('stats', {}).get('prepost_window', 3)

# Parse each configured event's date into a pandas Timestamp.
events = []
for ev in cfg.get('events', []):
    events.append({'name': ev['name'], 'date': pd.to_datetime(ev['date'])})

# Statistical tests & p-values
def fractional_year(period):
    """Convert a datetime-like Series to decimal years.

    A period starting on 1 January maps to the integer year; later dates add
    the elapsed fraction of that (leap-aware) year, e.g. 1 July 2021 -> ~2021.5.
    """
    days_in_year = 365 + period.dt.is_leap_year.astype(int)
    return period.dt.year + (period.dt.dayofyear - 1) / days_in_year


# Regress entropy on a continuous time axis. Using only the integer calendar
# year gives every bucket within a year the same x value, which throws away
# sub-year resolution and makes linregress fail when all data fall in one year.
years_con = fractional_year(cons_metrics['period'])
p_ent_con = stats.linregress(years_con, cons_metrics['entropy']).pvalue
years_lib = fractional_year(lib_metrics['period'])
p_ent_lib = stats.linregress(years_lib, lib_metrics['entropy']).pvalue
print(f"Q1: Entropy trend p-values => conservative={p_ent_con:.3g}, liberal={p_ent_lib:.3g}")

# Welch's t-test of cross-community similarity in the `prepost` months before
# versus after each event (the event's own timestamp is excluded from both).
window = pd.DateOffset(months=prepost)
for ev in events:
    period = cross_sim['period']
    before = cross_sim.loc[
        (period >= ev['date'] - window) & (period < ev['date']),
        'cross_similarity',
    ]
    after = cross_sim.loc[
        (period > ev['date']) & (period <= ev['date'] + window),
        'cross_similarity',
    ]
    # The test needs a sample variance on each side, i.e. at least two
    # observations; with fewer the p-value is NaN. Report the skip explicitly
    # rather than dropping the event silently.
    if len(before) < 2 or len(after) < 2:
        print(
            f"Q2 ({ev['name']}): skipped - need at least 2 periods on each side "
            f"(before={len(before)}, after={len(after)})"
        )
        continue
    p = stats.ttest_ind(before, after, equal_var=False).pvalue
    print(f"Q2 ({ev['name']}): cross-sim p-value = {p:.3g}")

# Align the two intra-group series on their shared periods (inner join), then
# run a paired t-test on the matched values.
cons_side = intra_cons.rename(columns={'intra_similarity': 'con'})
lib_side = intra_lib.rename(columns={'intra_similarity': 'lib'})
merged = cons_side.merge(lib_side, on='period')

p_echo = stats.ttest_rel(merged['con'], merged['lib']).pvalue
print(f"Q3: Intra-group similarity difference p-value = {p_echo:.3g}")

# Visualizations
viz = Visualizer(outdir)
communities = (
    ('Conservative', 'cons', cons_metrics, intra_cons),
    ('Liberal', 'lib', lib_metrics, intra_lib),
)

# Entropy plots for both communities, then topic-count plots for both.
for column, caption in (('entropy', 'Entropy'), ('topic_count', 'Topic Count')):
    for group, prefix, metrics, _ in communities:
        viz.plot_time_series(
            metrics, 'period', column,
            title=f'{group} {caption}',
            filename=f'{prefix}_{column}.png',
        )

# Median and inter-quartile band of the spread values within each period.
ribbon = (
    spread_df.groupby('period')['spread']
    .agg(
        median='median',
        q1=lambda values: values.quantile(0.25),
        q3=lambda values: values.quantile(0.75),
    )
    .reset_index()
)
viz.plot_ribbon(ribbon, 'period', 'median', 'q1', 'q3', title='Semantic Spread', filename='semantic_spread.png')

# Cross-community similarity is the only series annotated with the events.
viz.plot_time_series(
    cross_sim, 'period', 'cross_similarity',
    title='Cross-Community Similarity', events=events, filename='cross_similarity.png',
)
for group, prefix, _, intra in communities:
    viz.plot_time_series(
        intra, 'period', 'intra_similarity',
        title=f'{group} Echo Chamber',
        filename=f'intra_{prefix}.png',
    )

def freq_df(df, topics, freq=None):
    """Count documents per (period, topic).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'timestamp' column.
    topics : array-like
        One topic id per row of ``df``; matched to rows by position.
    freq : str, optional
        Period alias used to bucket timestamps. Defaults to
        ``cfg['analysis']['time_interval']`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns 'period' (start timestamp of each bucket), 'topic' and 'count'.
    """
    if freq is None:
        freq = cfg['analysis']['time_interval']
    return (
        # np.asarray keeps a Series of topics from being re-aligned on index
        # labels that differ from df's index.
        pd.DataFrame({'timestamp': df['timestamp'], 'topic': np.asarray(topics)})
        .assign(
            period=lambda d: d['timestamp']
                               .dt.to_period(freq)
                               .dt.to_timestamp()
        )
        .groupby(['period', 'topic'])
        .size()
        .reset_index(name='count')
    )

# Per-period topic counts, with each topic id mapped to its generated label.
cons_freq = freq_df(cons_df, cons_topics)
lib_freq = freq_df(lib_df, lib_topics)
for freq_table, labels in ((cons_freq, cons_labels), (lib_freq, lib_labels)):
    freq_table['topic_label'] = freq_table['topic'].map(labels)

top_n = cfg.get('analysis', {}).get('top_n', 5)

# One prevalence chart per community, then the combined comparison chart.
for freq_table, group, prefix in (
    (cons_freq, 'Conservative', 'cons'),
    (lib_freq, 'Liberal', 'lib'),
):
    viz.plot_topic_prevalence(
        freq_table,
        period_col='period', topic_col='topic_label', count_col='count',
        top_n=top_n, normalize=True,
        title=f'Top {top_n} {group} Topics Over Time',
        filename=f'{prefix}_top{top_n}_topics.png',
    )

viz.plot_combined_topic_trends(
    cons_freq, lib_freq,
    period_col='period', topic_col='topic_label', count_col='count',
    top_n=top_n, normalize=True,
    title=f'Top {top_n} Topics: Conservative vs Liberal',
    filename=f'combined_top{top_n}_topics.png',
)

# Display all PNGs
# File names written by the plotting calls, in the order they are shown.
plot_files = [
    'cons_entropy.png', 'lib_entropy.png',
    'cons_topic_count.png', 'lib_topic_count.png',
    'semantic_spread.png', 'cross_similarity.png',
    'intra_cons.png', 'intra_lib.png',
]
plot_files += [f'{prefix}_top{top_n}_topics.png' for prefix in ('cons', 'lib', 'combined')]

for fn in plot_files:
    display(Image(f"{outdir}/{fn}"))
