In [2]:
import json
from tqdm import tqdm

In [3]:
with open('conversations.json', 'r') as f:
    conversations = json.load(f)

In [4]:
def get_msg(convo):
    mapping = convo['mapping']
    user_msgs = []
    
    for node in mapping.keys():
        if mapping[node]['message']:
            if 'parts' in mapping[node]['message']['content'].keys() and mapping[node]['message']['author']['role'] == 'user':
                if isinstance(mapping[node]['message']['content']['parts'][0], str):
                    user_msgs.append(mapping[node]['message']['content']['parts'])
    return user_msgs

In [5]:
all_user_msgs = []
for user_msg in tqdm(conversations):
    all_user_msgs.extend(get_msg(user_msg))

len(all_user_msgs)

100%|██████████| 3261/3261 [00:00<00:00, 117873.98it/s]


14157

In [7]:
from openai import OpenAI
import os 
from dotenv import load_dotenv
import tiktoken

load_dotenv()

MAX_CONTEXT_LENGTH = 8192
EMBEDDING_MODEL = "text-embedding-3-small"

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def clean_msg(msg, max_length=MAX_CONTEXT_LENGTH, embedding_model=EMBEDDING_MODEL):
    msg = msg[0]
    if len(msg) == 0: # empty message
        return None
    encoding = tiktoken.encoding_for_model(embedding_model)
    tokens = encoding.encode(msg, disallowed_special=set())
    if len(tokens) > max_length:
        tokens = tokens[-max_length:]
    return encoding.decode(tokens)

texts_to_embed = []
for msg in all_user_msgs:
    cleaned_msg = clean_msg(msg)
    if cleaned_msg:
        texts_to_embed.append(cleaned_msg)

len(texts_to_embed)


14108

In [9]:
BATCH_SIZE = 50 # worst case only set to 35

embeddings = []
for i in tqdm(range(0, len(texts_to_embed), BATCH_SIZE), desc="Embedding batches"):
    batch = texts_to_embed[i:i+BATCH_SIZE]
    try:
        response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
        batch_embeddings = [data.embedding for data in response.data]
        embeddings.extend(batch_embeddings)
    except Exception as e:
        print(f"Error embedding batch {i}: {e}")
        break

print(f"Total embeddings: {len(embeddings)}")

Embedding batches: 100%|██████████| 283/283 [03:04<00:00,  1.53it/s]

Total embeddings: 14108





In [49]:
from sklearn.cluster import KMeans, SpectralClustering, HDBSCAN

# kmeans = KMeans(n_clusters=5, random_state=42)
# kmeans.fit(embeddings)
# labels = kmeans.labels_

# spectral_clustering = SpectralClustering(n_clusters=5, affinity='nearest_neighbors', random_state=42)
# spectral_clustering.fit(embeddings)
# labels = spectral_clustering.labels_

hdbscan = HDBSCAN(min_cluster_size=30, metric='cosine', cluster_selection_method='eom')
hdbscan.fit(embeddings)
labels = hdbscan.labels_

In [50]:
import numpy as np

hdbscan_label_mapping = {}

for label in np.unique(labels):
    indices = np.where(labels == label)[0]
    messages = [all_user_msgs[i][0] for i in indices]
    hdbscan_label_mapping[label] = messages

for label in sorted(hdbscan_label_mapping.keys()):
    count = len(hdbscan_label_mapping[label])
    if label == -1:
        print(f"Cluster {label} has {count} messages")
    else:
        print(f"Cluster {label} has {count} messages")


Cluster -1 has 12964 messages
Cluster 0 has 57 messages
Cluster 1 has 30 messages
Cluster 2 has 35 messages
Cluster 3 has 49 messages
Cluster 4 has 39 messages
Cluster 5 has 170 messages
Cluster 6 has 94 messages
Cluster 7 has 66 messages
Cluster 8 has 82 messages
Cluster 9 has 35 messages
Cluster 10 has 38 messages
Cluster 11 has 84 messages
Cluster 12 has 35 messages
Cluster 13 has 34 messages
Cluster 14 has 230 messages
Cluster 15 has 66 messages


In [56]:
from openai import OpenAI
client = OpenAI()

all_cluster_info = []
cluster_summaries = {i : None for i in hdbscan_label_mapping.keys()}

for i in tqdm(sorted(hdbscan_label_mapping.keys())):
    cluster_msgs = hdbscan_label_mapping[i][:300]
    cluster_msgs_text = " | ".join([msg for msg in cluster_msgs])
    prompt = f"""You are analyzing a cluster of similar user messages. Based on the examples below, create a concise descriptive label (2-4 words) that captures the main topic or theme.

        Messages in this cluster:
        {cluster_msgs_text}

        Instructions:
        - Identify the common theme or topic
        - Use 2-4 words
        - Be specific and descriptive
        - Return only the label name

        Cluster name:"""

    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
    output = response.choices[0].message.content
    cluster_summaries[i] = output

cluster_summaries

100%|██████████| 17/17 [01:46<00:00,  6.25s/it]


{-1: 'Palantir interview prep',
 0: 'Semantic navigation with LLMs',
 1: 'RISC-V Instruction Decoder',
 2: 'Machine learning optimization',
 3: 'Q-learning Implementation',
 4: 'DynamoDB to OpenSearch',
 5: 'Bluespec CPU pipeline',
 6: 'Robotics software development',
 7: 'Code debugging help',
 8: 'Cluster name: Code Debugging',
 9: 'ROS2 Gazebo Troubleshooting',
 10: 'Programming Debugging Challenges',
 11: 'Code debugging questions',
 12: 'Bitonic sorting networks',
 13: 'Multi-domain Q&A',
 14: 'LLM-guided indoor navigation',
 15: 'ROS2 Navigation Visualization'}

In [51]:
print(output)

Cluster 0: LLM-assisted indoor navigation research paper drafting
Cluster 1: RISC-V decoding & HDL debugging
Cluster 2: Advanced math/CS prompts (linear algebra and optimization)
Cluster 3: CS interview prep & coding error debugging
Cluster 4: Robotics research projects & assets data wrangling
Cluster 5: CPU pipeline fetch/decode and hardware design debugging
Cluster 6: Robotics startup grant idea development & competitive landscape
Cluster 7: Robotics design writing and impedance control question
Cluster 8: Traveling Repairman DP and AI writing prompts
Cluster 9: ROS2 map setup and navigation debugging
Cluster 10: KNN coding & probability/inference exercises
Cluster 11: Differential equations and linear algebra fundamentals
Cluster 12: Linguistics concepts + outreach writing
Cluster 13: Danish syntax and English sentence structure analysis
Cluster 14: Control theory, disturbances, SPL, and project ideation
Cluster 15: AB-MCTS frontier AI research blog overview


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

embeddings = np.array(embeddings)
tsne = TSNE(n_components=3, random_state=42)
tsne_result = tsne.fit_transform(embeddings)
embeddings_2d = tsne.fit_transform(embeddings)