In [9]:
# Standard library first, then the project-local helper module.
import importlib

import analysis_utils

# Re-execute analysis_utils so edits made to the module on disk are picked up
# without restarting the kernel. Kept as the last expression so the cell
# displays the reloaded module.
importlib.reload(analysis_utils)



<module 'analysis_utils' from '/Users/jingshu/Projects/machine_learning/cs224w_cb_graph/analysis_utils.py'>

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
from tqdm import tqdm

# ==========================================================
# 1. Configuration
# ==========================================================
model_name = "all-mpnet-base-v2"

# Faster (384 dim)
# model_name = "all-MiniLM-L6-v2"

# Destination of the enriched speeches; the parent directory must already exist.
OUTPUT_PATH = "data/speeches_with_embeddings.json"

# ==========================================================
# 2. Load the sentence-embedding model and the speeches
# ==========================================================
model = SentenceTransformer(model_name)
EMB_DIM = model.get_sentence_embedding_dimension()

print(f"Using model: {model_name} (embedding dimension = {EMB_DIM})")

speeches = analysis_utils.load_speeches()

# ==========================================================
# 3. Generate embeddings speech-by-speech
# ==========================================================
missing_text = []   # ids of speeches that had no usable text
new_embeddings = 0  # number of speeches actually encoded by the model

for sid, s in tqdm(speeches.items(), desc="Embedding speeches"):

    # Drop "date" from EVERY record, before any branching, so that records
    # with and without text end up with the same schema in the saved file.
    # The default of None tolerates records that never had the key.
    # NOTE(review): presumably "date" is removed because its value is not
    # JSON-serializable -- confirm against analysis_utils.load_speeches.
    s.pop("date", None)

    # `or ""` also covers a "text" key that is present but set to None.
    text = (s.get("text") or "").strip()

    # Missing or whitespace-only text -> fill with a zero-vector placeholder.
    if not text:
        s["embedding"] = np.zeros(EMB_DIM, dtype=float).tolist()
        missing_text.append(sid)
        continue

    # Encode the full speech text; convert to a plain list so json can dump it.
    emb = model.encode(text, show_progress_bar=False)
    s["embedding"] = emb.tolist()
    new_embeddings += 1

print(f"\nEmbeddings generated for {new_embeddings} speeches.")
print(f"Speeches with missing text (zero-vector filled): {len(missing_text)}")

# ==========================================================
# 4. Save updated speeches to disk
# ==========================================================
with open(OUTPUT_PATH, "w") as f:
    json.dump(speeches, f)

print("\nSaved updated speeches as 'speeches_with_embeddings.json'.")


Using model: all-mpnet-base-v2 (embedding dimension = 768)


Embedding speeches: 100%|██████████| 977/977 [00:21<00:00, 45.83it/s]



Embeddings generated for 977 speeches.
Speeches with missing text (zero-vector filled): 0

Saved updated speeches as 'speeches_with_embeddings.json'.
