In [8]:
import json
from tqdm import tqdm

In [9]:
# Decode the JSON file as UTF-8 explicitly so that reading it does not depend
# on the platform's default locale encoding.
with open('conversations.json', 'r', encoding='utf-8') as f:
    conversations = json.load(f)

In [10]:
def get_msg(convo):
    """Collect the text-leading user messages of one conversation record.

    Parameters
    ----------
    convo : dict
        A conversation whose ``'mapping'`` entry maps node ids to node dicts.
        A node's ``'message'`` is either falsy or a dict holding ``'author'``
        (with a ``'role'``) and ``'content'`` (optionally with ``'parts'``).

    Returns
    -------
    list
        The ``content['parts']`` value of every message whose author role is
        ``'user'`` and whose first part is a ``str``, in mapping order.
        Each entry is the whole ``parts`` list, not only its first element.
    """
    user_msgs = []

    for node in convo['mapping'].values():
        message = node.get('message')
        if not message:
            continue  # node carries no message payload

        if (message.get('author') or {}).get('role') != 'user':
            continue

        parts = (message.get('content') or {}).get('parts')
        # A missing or empty parts list has no parts[0]; skip it rather than
        # letting the index lookup raise IndexError.
        if parts and isinstance(parts[0], str):
            user_msgs.append(parts)
    return user_msgs

In [11]:
# Flatten the per-conversation results of get_msg into one list, showing a
# progress bar over the conversations.
all_user_msgs = [
    parts
    for convo in tqdm(conversations)
    for parts in get_msg(convo)
]

len(all_user_msgs)

100%|██████████| 3261/3261 [00:00<00:00, 132211.01it/s]


14157

In [None]:
from openai import OpenAI
import os 
from dotenv import load_dotenv
import tiktoken

load_dotenv()

MAX_CONTEXT_LENGTH = 8192  # per-message budget, measured in tokens
EMBEDDING_MODEL = "text-embedding-3-small"

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Resolve the tokenizer once; it is the same for every message.
encoding = tiktoken.encoding_for_model(EMBEDDING_MODEL)

embeddings = []
embedded_texts = []  # embedded_texts[k] is the text that produced embeddings[k]

for t in tqdm(all_user_msgs):
    string_t = t[0]
    if not string_t:
        # Nothing to embed; skipping keeps one empty message from ending the run.
        continue

    # disallowed_special=() tokenizes text that merely looks like a special
    # token as ordinary text instead of raising ValueError.
    tokens = encoding.encode(string_t, disallowed_special=())
    if len(tokens) > MAX_CONTEXT_LENGTH:
        # The budget is in tokens, so trim tokens (keeping the tail of the
        # message) rather than slicing characters.
        string_t = encoding.decode(tokens[-MAX_CONTEXT_LENGTH:])

    try:
        embedding = client.embeddings.create(input=string_t, model=EMBEDDING_MODEL).data[0].embedding
    except Exception as e:
        # Stop at the first failed request; results gathered so far are kept.
        print(e)
        break
    embeddings.append(embedding)
    embedded_texts.append(string_t)


In [28]:
from openai import OpenAI
import os 
from dotenv import load_dotenv
import tiktoken

# Settings for the embedding step.
EMBEDDING_MODEL = "text-embedding-3-small"
MAX_CONTEXT_LENGTH = 8192

# load_dotenv() runs before the key lookup so that a key supplied through a
# .env file is visible in the environment; the lookup yields None when unset.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def clean_msg(msg, max_length=MAX_CONTEXT_LENGTH, embedding_model=EMBEDDING_MODEL):
    """Return the first part of ``msg`` trimmed to at most ``max_length`` tokens.

    Parameters
    ----------
    msg : sequence
        Message parts; only ``msg[0]`` is used.
    max_length : int
        Maximum number of tokens to keep. When the text is longer, its
        trailing ``max_length`` tokens are kept.
    embedding_model : str
        Model name used to look up the tiktoken encoding.

    Returns
    -------
    str or None
        ``None`` when ``msg[0]`` has length zero; otherwise the text obtained
        by encoding ``msg[0]``, trimming if needed, and decoding again.
    """
    text = msg[0]
    if not len(text):
        return None  # empty message

    enc = tiktoken.encoding_for_model(embedding_model)
    # An empty disallowed_special set lets special-token lookalikes in the
    # text be tokenized as plain text instead of raising.
    token_ids = enc.encode(text, disallowed_special=set())
    kept = token_ids[-max_length:] if len(token_ids) > max_length else token_ids
    return enc.decode(kept)

# Clean every message and keep only the truthy results, i.e. drop the None
# that clean_msg returns for empty messages and any empty string.
texts_to_embed = list(filter(None, map(clean_msg, all_user_msgs)))

len(texts_to_embed)


14108

In [29]:
# Number of texts sent per embeddings request.
# NOTE(review): the original remark read "worst case only set to 35" --
# presumably 35 is the safe batch size when every text in a batch is at the
# maximum token length; confirm against the API's per-request limits.
BATCH_SIZE = 50

embeddings = []
for i in tqdm(range(0, len(texts_to_embed), BATCH_SIZE), desc="Embedding batches"):
    chunk = texts_to_embed[i:i + BATCH_SIZE]
    try:
        result = client.embeddings.create(input=chunk, model=EMBEDDING_MODEL)
        vectors = [item.embedding for item in result.data]
        embeddings += vectors
    except Exception as e:
        # Stop at the first failing batch; embeddings then covers only the
        # texts before offset i.
        print(f"Error embedding batch {i}: {e}")
        break

print(f"Total embeddings: {len(embeddings)}")

Embedding batches: 100%|██████████| 283/283 [03:45<00:00,  1.25it/s]

Total embeddings: 14108





In [64]:
from sklearn.cluster import KMeans

# Cluster count, defined once so the model and the label -> texts table below
# cannot drift apart.
N_CLUSTERS = 5

kmeans = KMeans(n_clusters=N_CLUSTERS, init='k-means++', random_state=42)
kmeans.fit(embeddings)
labels = kmeans.labels_

# Pre-create an empty list per label so every cluster has an entry even if no
# text lands in it.
text_label_mapping = {i: [] for i in range(N_CLUSTERS)}
# zip stops at the shorter input, so texts beyond len(labels) are ignored.
for text, label in zip(texts_to_embed, labels):
    text_label_mapping[label].append(text)

In [65]:
# Number of texts assigned to each cluster label.
cluster_sizes = {label: len(texts) for label, texts in text_label_mapping.items()}
cluster_sizes

{0: 1770, 1: 5130, 2: 2735, 3: 2917, 4: 1556}

In [78]:
from openai import OpenAI
client = OpenAI()

# A single chat request names every cluster: each cluster contributes one
# line built from its first 100 texts, joined with " | ".
joined_examples = {
    i: " | ".join(text_label_mapping[i][:100])
    for i in sorted(text_label_mapping)
}
all_cluster_info = [
    f"Cluster {i}: {examples_text}"
    for i, examples_text in joined_examples.items()
]

# chr(10) is the newline character: one cluster per line in the prompt.
cluster_lines = chr(10).join(all_cluster_info)

prompt = f"""I have clustered user messages into groups. For each cluster below, provide a short descriptive name (2-4 words).

{cluster_lines}

Respond in the format:
Cluster 0: [name]
Cluster 1: [name]
etc.
"""

messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-5", messages=messages)

response.choices[0].message.content

'Cluster 0: RL/Control Coding Help\nCluster 1: STEM Q&A and LaTeX\nCluster 2: Cloud GPU Troubleshooting\nCluster 3: Career/Writing Refinement\nCluster 4: Robotics Research Ideas'

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Stack the embedding vectors into a single array for t-SNE.
embeddings = np.array(embeddings)
tsne = TSNE(n_components=3, random_state=42)

# Fit once. A second fit_transform on the same data with the same fixed seed
# would recompute the same projection at double the cost.
tsne_result = tsne.fit_transform(embeddings)

# NOTE(review): despite its name this holds n_components=3 columns, exactly as
# before; confirm whether a separate 2-component projection was intended.
# A copy is used so that modifying one array cannot affect the other.
embeddings_2d = tsne_result.copy()

In [66]:
import pandas as pd

# One row per sample text: the first three texts of every cluster, with the
# clusters visited in ascending label order.
table_data = [
    {'Cluster': label, 'Example': example}
    for label in sorted(text_label_mapping.keys())
    for example in text_label_mapping[label][:3]
]

df_examples = pd.DataFrame(table_data)

df_examples

Unnamed: 0,Cluster,Example
0,0,"class LinearPolicy(nn.Module):\n def __init__(self, in_dim, out_dim):\n super(LinearPolicy, self).__init__()\n self.Linear = nn.Linear(in_dim, out_dim)\n\n def forward(self, ob..."
1,0,"class LinearPGNetwork(nn.Module):\n def __init__(self, in_dim=4, out_dim=2):\n super().__init__()\n self.linear = nn.Linear(in_dim, out_dim, bias=True)\n\n def forward(self, ob..."
2,0,def print_linear_coeffs(model):\n with torch.no_grad():\n A = model.linear.weight.detach().cpu().numpy()\n b = model.linear.bias.detach().cpu().numpy()\n w = A[1] - A[0]\n c...
3,1,what's the difference between software engineer and forward deployment software engineer
4,1,why swe over fdse?
5,1,"i don't get it, what are arms?"
6,2,how do i write text into the links file
7,2,how do i uninstall tauri from my mac. i downloaded it from my terminal
8,2,Original custom instructions no longer available
9,3,how do i build a personal dashboard that tells me how i've been using ChatGPT
