In [1]:
# 📦 Setup & Imports
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from umap.umap_ import UMAP
import hdbscan
from dotenv import load_dotenv
from tqdm import tqdm
import chromadb
from chromadb.config import Settings
import openai

# 🔑 Load API keys
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# 📁 Constants
USER_ID = "2"
OUTPUT_DIR = f"user_knowledge/user_{USER_ID}"


In [2]:

# 🧠 Connect to ChromaDB
chroma_client = chromadb.Client(Settings(persist_directory=OUTPUT_DIR))
collection = chroma_client.get_or_create_collection(name="calendar_events")

# 📥 Load all documents from Chroma
retrieved = collection.get(include=["embeddings", "documents", "metadatas"])
print(f"✅ Loaded {len(retrieved['documents'])} documents from ChromaDB")

docs = retrieved["documents"]
embeddings = np.array(retrieved["embeddings"])
metadatas = retrieved["metadatas"]
df = pd.DataFrame(metadatas)


✅ Loaded 0 documents from ChromaDB


In [3]:

# 🌈 UMAP Dimensionality Reduction
umap_model = UMAP(n_components=2, random_state=42)
umap_embeds = umap_model.fit_transform(embeddings)

# 🔍 HDBSCAN Clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
labels = clusterer.fit_predict(umap_embeds)
df["cluster"] = labels

# 📊 Cluster Visualization
plt.figure(figsize=(10, 7))
palette = sns.color_palette("hsv", len(set(labels)))
sns.scatterplot(x=umap_embeds[:, 0], y=umap_embeds[:, 1],
                hue=labels, palette=palette, legend="full")
plt.title("Calendar Event Clusters (UMAP + HDBSCAN)")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.show()


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:

# 🔎 Semantic Search
def get_query_embedding(text):
    response = openai.embeddings.create(
        input=[text],
        model="text-embedding-ada-002"
    )
    return np.array(response.data[0].embedding)

def semantic_search(query, top_k=5):
    query_vector = get_query_embedding(query)
    sims = cosine_similarity([query_vector], embeddings)[0]
    top_idxs = np.argsort(sims)[-top_k:][::-1]

    return [{
        "score": float(sims[i]),
        "text": docs[i],
        "cluster": df.iloc[i]["cluster"],
        "meta": df.iloc[i].to_dict()
    } for i in top_idxs]


In [None]:

# 🧪 Try searching
query = "team meeting about project delivery"
results = semantic_search(query)

for res in results:
    print(f"\n📌 [Cluster {res['cluster']}] Score: {res['score']:.2f}")
    print(f"📝 {res['text']}")


✅ Loaded 365 documents and 0 embeddings
