git push -u origin main
- TODO (refine): outline the clustering and sentiment-analysis process, making sure it is methodologically sound and produces meaningful results.


In [1]:
import pandas as pd

# Load the dataset from the feather file.
more_info = pd.read_feather("../../data/more_info_political.feather")

# Concatenate headline, abstract and lead paragraph into one text field.
# Missing values are replaced by empty strings first so that a single missing
# piece does not turn the whole concatenation into NaN.
text_parts = [
    more_info[column].fillna('')
    for column in ('main_headline', 'abstract', 'lead_paragraph')
]
more_info['combined_text'] = text_parts[0] + ' ' + text_parts[1] + ' ' + text_parts[2]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over unigrams and bigrams, using scikit-learn's built-in
# English stop-word list.
# NOTE(review): no min_df / max_df pruning is configured, so terms that occur
# in almost every document stay in the vocabulary — consider whether that is
# why the printed topics below share so many of the same words.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(more_info['combined_text'])

# Fit a 10-topic LDA model on the count matrix; random_state is pinned so the
# fit can be repeated.
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(X)

# Print top terms per topic
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    model         -- fitted object exposing ``components_`` (one weight row per topic)
    feature_names -- sequence mapping a column index to its term
    n_top_words   -- how many terms to list per topic, strongest first
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        listed = ", ".join(feature_names[col] for col in strongest)
        print(f"Topic #{number}: " + listed)

# Show the 10 highest-weighted terms of every fitted LDA topic.
print_top_words(lda, vectorizer.get_feature_names_out(), 10)


Topic #1: new, trump, president, year, said, federal, states, war, world, officials
Topic #2: trump, new, president, court, biden, states, year, abortion, climate, united
Topic #3: president, trump, harris, kamala, kamala harris, donald, donald trump, biden, vice, vice president
Topic #4: israel, gaza, israeli, hamas, war, said, trump, cease, president, military
Topic #5: new, trump, president, york, new york, city, year, black, said, people
Topic #6: president, trump, new, biden, people, year, election, city, war, said
Topic #7: trump, president, donald trump, donald, new, tariffs, said, elect, president elect, states
Topic #8: trump, president, new, donald trump, donald, china, war, years, people, said
Topic #9: ukraine, russia, president, said, russian, trump, war, new, ukrainian, military
Topic #10: israel, said, state, minister, president, biden, prime, prime minister, states, netanyahu


In [3]:
# 1) Install what you need (run once in your venv)
# pip install gensim nltk pyldavis

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import pandas as pd

# Requires more_info['combined_text'] to exist already (built in the first cell).
# fillna("") guards against any remaining missing values.
texts_raw = more_info['combined_text'].fillna("")

# 2) Preprocess: tokenize, remove stopwords & short tokens.
# NLTK's English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))
def preprocess(doc):
    """Tokenize ``doc`` and keep only informative tokens.

    Tokenization is delegated to gensim's ``simple_preprocess`` with
    ``deacc=True``; tokens found in the module-level ``stop_words`` set or
    with 3 or fewer characters are dropped. Returns a list of tokens.
    """
    return [
        token
        for token in simple_preprocess(doc, deacc=True)
        if len(token) > 3 and token not in stop_words
    ]

# Token list for every document.
texts = [preprocess(raw_text) for raw_text in texts_raw]

# 3) Build dictionary & corpus.
# The vocabulary is pruned with filter_extremes (no_below=5, no_above=0.5,
# keep_n=5000) before each document is turned into a bag-of-words vector.
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000)
corpus = list(map(dictionary.doc2bow, texts))

# 4) Train & inspect models for different topic counts
def train_and_print(k):
    """Train a ``k``-topic gensim LDA model and print each topic's top 8 words.

    Uses the module-level ``corpus`` and ``dictionary``. The random state is
    fixed at 42. Returns the trained model.
    """
    settings = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=42,
        passes=10,
        alpha="auto",
    )
    model = LdaModel(**settings)

    print(f"\n=== Top words for {k} topics ===")
    for number in range(1, k + 1):
        # show_topic is 0-based while the printed label is 1-based.
        top_terms = [term for term, _weight in model.show_topic(number - 1, 8)]
        print(f"Topic {number}:", top_terms)
    return model

# Train and print a 7-topic model (the intended range for the topic count is 5–10).
lda7 = train_and_print(7)

# 5) Interactive pyLDAvis view of the 7-topic model — currently disabled.
#vis = pyLDAvis.gensim_models.prepare(lda7, corpus, dictionary)

import pyLDAvis
#pyLDAvis.enable_notebook()   # render pyLDAvis output inline in the notebook
#vis                          # evaluating the prepared object as the last expression displays it



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynshek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== Top words for 7 topics ===
Topic 1: ['people', 'coal', 'last', 'black', 'like', 'americans', 'life', 'country']
Topic 2: ['israel', 'israeli', 'gaza', 'said', 'hezbollah', 'officials', 'lebanon', 'beirut']
Topic 3: ['could', 'china', 'tuesday', 'economy', 'strike', 'gulf', 'ports', 'market']
Topic 4: ['climate', 'united', 'states', 'world', 'said', 'china', 'global', 'court']
Topic 5: ['trump', 'president', 'donald', 'harris', 'biden', 'kamala', 'abortion', 'former']
Topic 6: ['ukraine', 'russia', 'east', 'russian', 'president', 'middle', 'first', 'foreign']
Topic 7: ['city', 'nuclear', 'plant', 'power', 'mayor', 'help', 'bank', 'west']


In [4]:



from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk

# Passed to BERTopic as nr_topics below and used as the cap on how many
# topics are printed.
topic_number = 18

# --- 0) Stop-word setup, same as in the gensim LDA cell ---
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Tokenize ``text`` and drop stop words and short tokens.

    A falsy ``text`` (e.g. ``None`` or ``""``) is treated as an empty string.
    Tokenization uses gensim's ``simple_preprocess`` with ``deacc=True``; a
    token is kept only if it is longer than 3 characters and is not in the
    module-level ``stop_words`` set. This definition replaces the earlier
    ``preprocess`` defined in the gensim LDA cell.
    """
    source = text if text else ""
    return [
        tok
        for tok in simple_preprocess(source, deacc=True)
        if len(tok) > 3 and tok not in stop_words
    ]

# Each document becomes the space-joined tokens that survive preprocess().
# NOTE(review): the sentence embedder below therefore receives stop-word-stripped
# token strings rather than the original sentences — confirm this is intended.
docs_raw = more_info["combined_text"].fillna("").tolist()
docs = [" ".join(preprocess(doc)) for doc in docs_raw]

# --- 1) Sentence-embedding model used by BERTopic ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# --- 2) UMAP dimensionality reduction with a fixed seed ---
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=100,    # fixed seed so the reduction can be repeated
    low_memory=True
)

# --- 3) HDBSCAN clustering ---
hdbscan_model = HDBSCAN(
    min_cluster_size=5, # originally 10
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True
    # No random_state is passed here; the original author notes this
    # configuration behaved stably.
)

# --- 4) Plug the embedder, UMAP and HDBSCAN into BERTopic ---
topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics=topic_number, # originally 10
    verbose=False
)

# --- 5) Fit the model and get one topic id per document ---
# `topics` is aligned with `docs`; later cells index both by position.
topics, probs = topic_model.fit_transform(docs)

# --- 6) Inspect the fitted topics (outlier topic -1 excluded, at most topic_number of them) ---
topic_info = topic_model.get_topic_info()
is_real_topic = topic_info["Topic"] != -1
valid_topics = topic_info.loc[is_real_topic, "Topic"].head(topic_number).tolist()

header = "\nTop 10 terms per topic (seeded):\n" + "-"*40
print(header)
for t in valid_topics:
    terms = topic_model.get_topic(t)
    words = [term for term, _score in terms[:10]]
    print(f"Topic {t:2d}:", ", ".join(words))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynshek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Top 10 terms per topic (seeded):
----------------------------------------
Topic  0: israel, gaza, israeli, hamas, hezbollah, said, iran, military, lebanon, fire
Topic  1: trump, president, abortion, donald, harris, states, biden, kamala, state, voters
Topic  2: ukraine, russia, russian, president, ukrainian, tiktok, putin, moscow, china, said
Topic  3: climate, change, heat, warming, hurricane, global, environmental, world, helene, planet
Topic  4: china, electric, chinese, company, energy, boeing, steel, vehicles, tariffs, trade
Topic  5: black, editor, readers, carter, also, museum, artist, died, beyonce, film
Topic  6: inflation, rates, interest, reserve, federal, rate, market, bank, central, percent
Topic  7: germany, europe, right, france, party, britain, european, government, chancellor, italy
Topic  8: china, india, chinese, modi, hong, kong, pandas, narendra, trudeau, canada
Topic  9: haiti, venezuela, africa, african, country, maduro, president, election, nicolas, kenya
Topic

In [5]:
# Display parameters
topic_number = 18        # upper bound on how many topics are shown
min_topic_size = 50      # a topic needs at least this many documents
outlier_id = -1          # topic id excluded as the outlier topic

# 1) Topic summary table (the Topic and Count columns are used below)
topic_info = topic_model.get_topic_info()

# 2) Keep topics that are large enough and are not the outlier topic
large_enough = topic_info["Count"] >= min_topic_size
not_outlier = topic_info["Topic"] != outlier_id
big_topics = topic_info.loc[large_enough & not_outlier, "Topic"].tolist()

# 3) Cap the list at topic_number entries
selected_topics = big_topics[:topic_number]

print(f"Displaying up to {topic_number} topics with ≥{min_topic_size} docs:")
print(selected_topics, "\n")

# 4) Top-10 terms and document count for each selected topic
print("Top 10 terms per selected topic:\n" + "-"*50)
for t in selected_topics:
    terms = topic_model.get_topic(t)   # [(word, score), ...]
    words = [term for term, _score in terms[:10]]
    count = topic_info.loc[topic_info["Topic"] == t, "Count"].values[0]
    joined = ', '.join(words)
    print(f"Topic {t:2d} ({count} docs): {joined}")

# 5) Build BERTopic's topic-map figure restricted to the selected topics.
# The figure is only created here; the .show() call is left commented out.
fig_map = topic_model.visualize_topics(topics=selected_topics)
#fig_map.show()

# Bar chart of the top 10 words for every selected topic (display also disabled).
fig_bar = topic_model.visualize_barchart(
    topics=selected_topics,
    top_n_topics=len(selected_topics),
    n_words=10
)
#fig_bar.show()


Displaying up to 18 topics with ≥50 docs:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 

Top 10 terms per selected topic:
--------------------------------------------------
Topic  0 (2186 docs): israel, gaza, israeli, hamas, hezbollah, said, iran, military, lebanon, fire
Topic  1 (1669 docs): trump, president, abortion, donald, harris, states, biden, kamala, state, voters
Topic  2 (1218 docs): ukraine, russia, russian, president, ukrainian, tiktok, putin, moscow, china, said
Topic  3 (477 docs): climate, change, heat, warming, hurricane, global, environmental, world, helene, planet
Topic  4 (430 docs): china, electric, chinese, company, energy, boeing, steel, vehicles, tariffs, trade
Topic  5 (401 docs): black, editor, readers, carter, also, museum, artist, died, beyonce, film
Topic  6 (367 docs): inflation, rates, interest, reserve, federal, rate, market, bank, central, percent
Topic  7 (263 docs): germany, europe, right, france, party, britain, european, government, chancellor,

In [6]:
#ok actually works fr may 12

import json
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import nltk
from nltk.corpus import stopwords

# Download standard stopwords if you haven't already
nltk.download('stopwords')

# Start with NLTK's standard English stopwords
custom_stopwords = set(stopwords.words('english'))

# Domain-specific additions considered too generic or uninformative.
# NOTE(review): build_network passes this set to CountVectorizer(stop_words=...).
# scikit-learn matches stop words against single tokens before n-grams are
# formed, so the multi-word entries below ("york times", "years ago", ...)
# are presumably never filtered out — verify, and filter phrases explicitly
# if they should be excluded.
additional_stopwords = [
    # Boilerplate phrases and what look like metadata field names
    "york times", "years ago", "year old", "today episode",
    "guest", "essay",
    "content_kicker", "kicker", "print_headline"
]

# Merge the additions into the stop-word set
custom_stopwords = custom_stopwords.union(set(additional_stopwords))

#originally included overlapping phrases: manual check for famous people
# Maps partial or over-long n-gram variants of a well-known name to a single
# canonical form. normalize_doc applies the entries in insertion order
# (case-insensitively, on word boundaries) and then collapses repeated words,
# which is why a variant such as "president donald" may expand to
# "donald trump" even when "trump" follows in the text.
#
# Fixes relative to the previous version:
#   * "president vladimir" now maps to Vladimir Putin (it pointed to Zelensky);
#   * the misspelled key "minister benjmain netanyahu" is corrected;
#   * the duplicated "volodymyr ukraine" entry is listed once.
ENTITY_MAP = {
    "prime minister benjamin": "benjamin netanyahu",
    "minister benjamin netanyahu": "benjamin netanyahu",
    "minister benjamin": "benjamin netanyahu",
    "benjamin netanyahu israel": "benjamin netanyahu",
    "president donald": "donald trump",
    "president trump": "donald trump",
    "former donald": "donald trump",
    "former donald trump": "donald trump",
    "president elect donald": "donald trump",
    "elect donald trump": "donald trump",
    "donald trump trump": "donald trump",
    "trump trump": "donald trump",
    "vice president kamala": "kamala harris",
    "president kamala harris": "kamala harris",
    "mayor eric": "eric adams",
    "mayor eric adams": "eric adams",
    "state antony": "antony blinken",
    "secretary state antony": "antony blinken",
    "street journal": "wall street journal",
    "president volodymyr zelensky": "volodymyr zelensky",
    "president volodymyr": "volodymyr zelensky",
    "volodymyr zelensky ukraine": "volodymyr zelensky",
    "volodymyr ukraine": "volodymyr zelensky",
    "president vladimir putin": "vladimir putin",
    "president vladimir": "vladimir putin",
    "minister justin": "justin trudeau",
    "prime minister justin": "justin trudeau",
    "minister justin trudeau": "justin trudeau",
    "trudeau canada": "justin trudeau",
    "justin trudeau canada": "justin trudeau",
    # etc.
}

import re

def remove_duplicate_phrases(text):
    """Collapse a run of the same word repeated back-to-back into one occurrence.

    ``"donald trump trump trump said"`` becomes ``"donald trump said"``.
    Matching is case-sensitive and works on whole words separated by
    whitespace. The repeated group is matched one or more times so that runs
    of three or more copies are fully collapsed in a single pass (a single
    ``word word`` pattern would leave ``"a a a"`` as ``"a a"``).

    Returns the cleaned string; text without repeats is returned unchanged.
    """
    return re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)

# Update normalize_doc
def normalize_doc(text):
    """Rewrite known name variants in ``text`` to their canonical form.

    Every key of the module-level ``ENTITY_MAP`` is replaced by its value,
    in the map's iteration order, matching case-insensitively and only on
    word boundaries. Repeated words left behind by the substitutions are
    then collapsed with ``remove_duplicate_phrases``. Returns the new string.
    """
    normalized = text
    for variant, canonical in ENTITY_MAP.items():
        normalized = re.sub(
            rf'\b{re.escape(variant)}\b',
            canonical,
            normalized,
            flags=re.IGNORECASE,
        )
    return remove_duplicate_phrases(normalized)

# --- Updated build_network with size & community group ---
def build_network(topic_id, docs, topics, min_cooccurrence=3, top_n_phrases=50):
    """Build a phrase co-occurrence network for the documents of one topic.

    Parameters
    ----------
    topic_id : topic label to select; document ``docs[i]`` belongs to the
        topic when ``topics[i] == topic_id``.
    docs : sequence of str, aligned position-by-position with ``topics``.
    topics : sequence of topic labels, one per document.
    min_cooccurrence : minimum number of documents that must contain both
        phrases for an edge to be created.
    top_n_phrases : passed to ``CountVectorizer`` as ``max_features``; the
        number of bigram/trigram phrases kept as nodes.

    Returns
    -------
    dict with keys ``"topic"`` (the given id), ``"nodes"`` (dicts with
    ``id``, ``size`` = total phrase count, ``group`` = community index) and
    ``"links"`` (dicts with ``source``, ``target``, ``value`` = number of
    documents containing both phrases). ``nodes`` and ``links`` are empty
    when the topic has no documents or no phrase can be extracted.

    Relies on the module-level ``normalize_doc`` and ``custom_stopwords``.
    """
    # 1) Select this topic's documents and canonicalise entity names
    topic_docs = [normalize_doc(docs[i]) for i, t in enumerate(topics) if t == topic_id]
    if not topic_docs:
        # No document carries this topic id, so there is nothing to vectorize.
        return {"topic": topic_id, "nodes": [], "links": []}

    # 2) Extract bigrams/trigrams, keeping the most frequent `top_n_phrases`
    vec = CountVectorizer(ngram_range=(2, 3), stop_words=list(custom_stopwords), max_features=top_n_phrases)
    try:
        X = vec.fit_transform(topic_docs)
    except ValueError as err:
        # CountVectorizer reports an "empty vocabulary" when no n-gram survives
        # (e.g. every document has fewer than two tokens). Anything else is a
        # real error and is re-raised.
        if "empty vocabulary" not in str(err):
            raise
        return {"topic": topic_id, "nodes": [], "links": []}
    # Plain str ids keep node names and the JSON output free of numpy types.
    phrases = [str(p) for p in vec.get_feature_names_out()]

    # 3) Total occurrences of each phrase across the topic's documents
    freqs = dict(zip(phrases, X.sum(axis=0).A1))

    # 4) Document-level co-occurrence counts, taken from the vectorizer's own
    #    document-term matrix. This keeps "phrase is present" consistent with
    #    how the phrases were extracted; a raw substring test on the text would
    #    also match inside longer words and would miss n-grams formed across a
    #    removed stop word.
    presence = X.copy()
    presence.data[:] = 1                      # count each phrase once per document
    pair_counts = (presence.T @ presence).tocoo()

    # 5) Create graph: one node per phrase, one edge per frequent enough pair
    G = nx.Graph()
    for phrase in phrases:
        G.add_node(phrase, frequency=int(freqs[phrase]))
    for i, j, w in zip(pair_counts.row, pair_counts.col, pair_counts.data):
        # i < j visits each unordered pair once and skips the diagonal.
        if i < j and w >= min_cooccurrence:
            G.add_edge(phrases[i], phrases[j], weight=int(w))

    # 6) Community detection for grouping/color.
    #    Modularity divides by the total number of edges, so detection is
    #    skipped for an edgeless graph and every phrase gets its own group.
    if G.number_of_edges() > 0:
        communities = list(nx.community.greedy_modularity_communities(G))
    else:
        communities = [{node} for node in G.nodes]
    comm_map = {node: idx for idx, comm in enumerate(communities) for node in comm}

    # 7) Serialize nodes and links with size & group
    nodes = [
        {
            "id": node,
            "size": data["frequency"],        # node size
            "group": comm_map.get(node, -1)   # community id
        }
        for node, data in G.nodes(data=True)
    ]
    links = [
        {
            "source": u,
            "target": v,
            "value": data["weight"]           # edge weight
        }
        for u, v, data in G.edges(data=True)
    ]

    return {"topic": topic_id, "nodes": nodes, "links": links}

# --- Build and write JSON ---
# Requires `docs` and `topics` (the per-document topic ids returned by
# topic_model.fit_transform) and the `selected_topics` list from earlier cells.
# One network is built per selected topic; an edge needs at least 5
# co-occurring documents and each network keeps at most 50 phrases.
networks = [build_network(tid, docs, topics, min_cooccurrence=5, top_n_phrases=50)
            for tid in selected_topics]

# The "joc-data" directory must already exist; open() does not create it.
with open("joc-data/networks.json", "w") as f:
    json.dump(networks, f, indent=2)

print(f"Wrote {len(networks)} topic networks with size & group to joc-data/networks.json")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynshek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Wrote 15 topic networks with size & group to joc-data/networks.json


In [7]:
# Export one metadata record per topic to joc-data/topics.json.
# NOTE(review): a later cell writes to the same path, so this output is
# overwritten when the notebook is run top to bottom.
topics_meta = []

for topic_id in valid_topics:
    # Top 10 (word, weight) pairs for this topic
    terms = topic_model.get_topic(topic_id)[:10]
    words = [word for word, _ in terms]
    top_words = [{"word": word, "value": float(score)} for word, score in terms]
    label = ", ".join(words)

    # Row of the topic table for this topic: document count and row position
    topic_row = topic_info[topic_info["Topic"] == topic_id]
    count = topic_row["Count"].values[0]
    idx = topic_row.index[0]

    # x/y are the first two components of the topic embedding at that row.
    # NOTE(review): this is only a 2-D position if the embeddings themselves
    # are 2-D, and it assumes topic_info's index lines up with the rows of
    # topic_embeddings_ — TODO confirm both.
    x, y = topic_model.topic_embeddings_[idx][:2]

    topics_meta.append(dict(
        id=topic_id,
        label=label,
        topWords=top_words,
        count=int(count),
        x=float(x),
        y=float(y),
    ))

with open("joc-data/topics.json","w") as f:
    json.dump(topics_meta, f, indent=2)

In [8]:
# Rebuild the preprocessed documents (same expressions as in the BERTopic cell)
# so this cell has `docs` aligned with `topics`.
docs_raw = more_info["combined_text"].fillna("").tolist()
docs = [" ".join(preprocess(doc)) for doc in docs_raw]

# Copy of more_info cut to the rows that have a topic assignment.
# NOTE(review): .loc slices by label and includes the end label, so this keeps
# exactly len(topics) rows only if more_info has the default 0..n-1 index —
# verify. The result is not used in the rest of this cell.
more_info_filtered = more_info.loc[:len(topics) - 1].copy()

# Hand-assigned coarse group for each BERTopic topic id (0–16).
# Ids missing from this map get "Other" where it is applied below.
# These assignments are tied to one particular fitted model: if the topic
# model is refit and topic ids change, the map must be reviewed.
topic_supergroups = {
    0: "International geopolitics",
    1: "Internal politics",
    2: "International geopolitics",
    3: "Climate",
    4: "International geopolitics",
    5: "Culture",
    6: "Internal politics",
    7: "International geopolitics",
    8: "International geopolitics",
    9: "International geopolitics",
    10: "Internal politics",
    11: "Internal politics",
    12: "Culture",
    13: "Culture",
    14: "Internal politics",
    15: "Culture",
    16: "Internal politics"
}

# Hand-written display label for each BERTopic topic id (0–16).
# Ids missing from this map get "Other" where it is applied below.
# Like topic_supergroups, the labels describe one particular fitted model and
# must be reviewed if the topic ids change.
manualLabels = {
    0: "Israeli–Palestinian conflict",
    1: "US Politics & Elections",
    2: "Russia-Ukraine War, Global Politics",
    3: "Climate Change & Extreme Weather",
    4: "China, Trade, & Energy Industry",
    5: "Culture, Arts, Obituaries",
    6: "Economy",
    7: "European Politics",
    8: "India-China Relations & Global Leaders",
    9: "Global South: Haiti, Africa, Venezuela",
    10: "Crime, Trials & Policing",
    11: "Urban Policy",
    12: "Health",
    13: "International Sports",
    14: "Banking",
    15: "Pope, Church, & Religion",
    16: "Security"
}

# Export one record per topic to joc-data/topics.json: id, auto label,
# document count, per-word document share, super-group and manual label.
topics_meta = []

for topic_id in valid_topics:
    # Top 10 (word, score) pairs; the words also form the automatic label.
    terms = topic_model.get_topic(topic_id)[:10]
    words = [w for w, _ in terms]

    # Documents assigned to this topic, each tokenized once on whitespace.
    doc_indices = [i for i, t in enumerate(topics) if t == topic_id]
    topic_docs = [docs[i] for i in doc_indices]
    num_docs = len(topic_docs)
    token_sets = [set(doc.split()) for doc in topic_docs]

    # "value" is the percentage of the topic's documents containing the word
    # as a whole token, rounded to one decimal place.
    top_words = []
    for word, _ in terms:
        hits = sum(word in tokens for tokens in token_sets)
        percent = hits / num_docs if num_docs else 0
        top_words.append({
            "word": word,
            "value": round(percent * 100, 1)  # as percentage
        })

    count = topic_info[topic_info["Topic"] == topic_id]["Count"].values[0]

    topics_meta.append({
        "id": topic_id,
        "label": ", ".join(words),
        "count": int(count),
        "topWords": top_words,
        # Topics without a hand-made entry fall back to "Other".
        "group": topic_supergroups.get(topic_id, "Other"),
        "manualLabel": manualLabels.get(topic_id, "Other"),
    })

with open("joc-data/topics.json", "w") as f:
    json.dump(topics_meta, f, indent=2)
