In [None]:
import umap
import json
import joblib
import pandas as pd
import numpy as np
import cupy as cp
from google.colab import drive
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml import UMAP, KMeans, DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sentence_transformers import SentenceTransformer
from collections import defaultdict

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used by this
# notebook are all located under /content/drive/MyDrive.
drive.mount('/content/drive')

Clustering and visualization


In [None]:
# Parameters
TFIDF_PATH = "/content/drive/MyDrive/MSE 641 Project Data/tfidf_matrix_fast.pkl"
UMAP_DIM = 30
UMAP_N_NEIGHBORS = 30
SVD_COMPONENTS = 200
K_RANGE = [5, 10, 20]
DBSCAN_EPS = 1.2
DBSCAN_MIN_SAMPLES = 15
OUTPUT_DIR = "/content/drive/MyDrive/MSE 641 Project Data/"
# Label written for input rows that never reach clustering (all-zero SVD vector).
UNCLUSTERED_LABEL = -1
# Optional subsample size for the silhouette score, which needs pairwise
# distances and is therefore quadratic in the number of points.
# None = use every point (original behaviour).
SILHOUETTE_SAMPLE_SIZE = None

# Load & Preprocess
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# this project produced itself.
X = joblib.load(TFIDF_PATH)
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=42)
X_svd_all = svd.fit_transform(X)
n_rows = X_svd_all.shape[0]

# Clustering runs on the distinct, non-zero SVD vectors only. np.unique both
# removes duplicate rows AND sorts the remaining ones, so the rows of X_svd are
# no longer in input order. `row_to_unique` remembers, for every input row,
# which row of X_svd represents it (-1 for zero-norm rows), so that results can
# be written back in the original row order further down.
nonzero_rows = np.flatnonzero(np.linalg.norm(X_svd_all, axis=1) > 0)
X_svd, inverse = np.unique(X_svd_all[nonzero_rows], axis=0, return_inverse=True)
row_to_unique = np.full(n_rows, -1, dtype=np.int64)
# reshape(-1): some NumPy releases return a non-flat inverse when axis is given.
row_to_unique[nonzero_rows] = np.asarray(inverse).reshape(-1)
clustered_rows = np.flatnonzero(row_to_unique >= 0)
unique_of_clustered = row_to_unique[clustered_rows]

# Saved so that umap_2d.pkl (one row per distinct vector) can be joined back
# to the input rows by other notebooks.
joblib.dump(row_to_unique, OUTPUT_DIR + "svd_row_to_unique.pkl")
print(
    f"Input rows: {n_rows} | zero-norm rows skipped: {n_rows - len(nonzero_rows)} | "
    f"distinct vectors clustered: {len(X_svd)}"
)

# UMAP Dimensionality Reduction
X_gpu = cp.asarray(X_svd)
# Named `umap_model` so the imported `umap` module is not shadowed.
umap_model = UMAP(
    n_components=UMAP_DIM,
    random_state=42,
    n_neighbors=UMAP_N_NEIGHBORS,
    min_dist=0.1
)
X_umap = cp.asnumpy(umap_model.fit_transform(X_gpu))

# Project to 2D for plotting
if UMAP_DIM > 2:
    proj2d = TruncatedSVD(n_components=2, random_state=42)
    X_2d = proj2d.fit_transform(X_umap)
else:
    X_2d = X_umap

# Save the 2D projection
joblib.dump(X_2d, OUTPUT_DIR + "umap_2d.pkl")
print("Saved 2D projection for visualization.")

# Subplot for all k; squeeze=False keeps `axes` indexable even for a single k.
fig, axes = plt.subplots(
    1, len(K_RANGE), figsize=(20, 6), sharex=True, sharey=True, squeeze=False
)
axes = axes[0]

for i, K in enumerate(K_RANGE):
    # K-Means clustering
    kmeans = KMeans(n_clusters=K, random_state=42, n_init="auto")
    k_labels = kmeans.fit_predict(X_umap)

    # Save K-Means labels, one per ORIGINAL input row and in input order, so
    # they can be attached directly to the source dataframe. Duplicate rows
    # inherit the label of their representative vector; zero-norm rows get
    # UNCLUSTERED_LABEL.
    row_labels = np.full(n_rows, UNCLUSTERED_LABEL, dtype=k_labels.dtype)
    row_labels[clustered_rows] = k_labels[unique_of_clustered]
    joblib.dump(row_labels, OUTPUT_DIR + f"kmeans_labels_k{K}.pkl")

    # Cluster evaluation metrics (undefined when everything is in one cluster)
    if len(np.unique(k_labels)) > 1:
        sil = silhouette_score(
            X_umap, k_labels,
            sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=42
        )
        ch = calinski_harabasz_score(X_umap, k_labels)
        db = davies_bouldin_score(X_umap, k_labels)
    else:
        sil, ch, db = np.nan, np.nan, np.nan

    print(f"\n==== Results for k={K} ====")
    print(f"Silhouette Score: {sil:.4f}")
    print(f"Calinski-Harabasz Index: {ch:.1f}")
    print(f"Davies-Bouldin Index: {db:.4f}")
    print(f"KMeans cluster sizes: {np.bincount(k_labels)}")

    # DBSCAN inside each KMeans cluster to find noise.
    # noise_mask is indexed like X_umap / X_2d (one entry per distinct vector).
    noise_mask = np.zeros(len(X_umap), dtype=bool)
    for cluster_id in np.unique(k_labels):
        idx_cluster = np.flatnonzero(k_labels == cluster_id)
        # Clusters smaller than min_samples are skipped, not marked as noise.
        if len(idx_cluster) < DBSCAN_MIN_SAMPLES:
            continue
        dbscan = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
        db_labels = dbscan.fit_predict(X_umap[idx_cluster])
        noise_mask[idx_cluster[db_labels == -1]] = True

    noise_indices = np.flatnonzero(noise_mask)
    print(f"DBSCAN detected noise points: {len(noise_indices)}")

    # Save DBSCAN noise indices as ORIGINAL row positions (same indexing as the
    # saved label file): every input row whose representative vector is noise.
    noise_rows = clustered_rows[noise_mask[unique_of_clustered]].tolist()
    joblib.dump(noise_rows, OUTPUT_DIR + f"dbscan_noise_indices_k{K}.pkl")

    # Plot overlay
    ax = axes[i]
    # main clustering group
    ax.scatter(
        X_2d[:, 0], X_2d[:, 1],
        c=k_labels, cmap="tab20",
        s=10, alpha=0.75,
        label="KMeans clusters"
    )
    # noise points
    if noise_indices.size:
        ax.scatter(
            X_2d[noise_indices, 0],
            X_2d[noise_indices, 1],
            c="k", marker=".", s=8,
            label="DBSCAN noise", alpha=0.25
        )
    ax.set_title(f"k={K}")
    ax.set_xlabel("Component 1")
    if i == 0:
        ax.set_ylabel("Component 2")
    ax.grid(True, linestyle="--", alpha=0.3)
    # One legend is enough because all panels share the same encoding.
    if i == len(K_RANGE) - 1:
        ax.legend(markerscale=1.5)

plt.suptitle("KMeans with DBSCAN Noise Overlay (all k)")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Save the combined plot
plt.savefig(OUTPUT_DIR + "kmeans_dbscan_subplots.png", dpi=180)
plt.show()

print("\nAll done.")


Example prompts and responses from each cluster for k = 5, 10, and 20

In [None]:
TOP_N = 10
K_RANGE = [5, 10, 20]
SNIPPET_CHARS = 100  # maximum characters of prompt/response shown per example


def shorten(text, limit=SNIPPET_CHARS):
    """Return `text` cut to `limit` characters, with "..." appended if it was cut.

    Missing values (None / NaN) become an empty string and other non-string
    values are converted with str(), so a bad record cannot abort the listing.
    """
    if not isinstance(text, str):
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)
    return (text[:limit] + "...") if len(text) > limit else text


df = pd.read_json("/content/drive/MyDrive/MSE 641 Project Data/wildchat_en_cleaned.jsonl", lines=True)

for K_VAL in K_RANGE:
    print(f"\n===== Top {TOP_N} Examples per Cluster for k = {K_VAL} =====\n")
    # Load cluster labels and noise indices for this k
    k_labels = np.asarray(joblib.load(OUTPUT_DIR + f"kmeans_labels_k{K_VAL}.pkl"))
    dbscan_noise_indices = joblib.load(OUTPUT_DIR + f"dbscan_noise_indices_k{K_VAL}.pkl")

    # Labels are attached positionally, so they must be one-to-one with df rows.
    # NOTE(review): this also presumes df is in the same row order as the
    # matrix that was clustered - TODO confirm against the clustering step.
    if len(k_labels) != len(df):
        raise ValueError(
            f"kmeans_labels_k{K_VAL}.pkl holds {len(k_labels)} labels but the "
            f"dataframe has {len(df)} rows; labels must align one-to-one with "
            "the JSONL rows before examples can be shown."
        )

    # Noise indices are row positions, so build a positional mask instead of
    # going through the label-based df.loc lookup.
    noise_mask = np.zeros(len(df), dtype=bool)
    noise_mask[np.asarray(dbscan_noise_indices, dtype=np.int64)] = True

    df['cluster'] = k_labels
    df['dbscan_noise'] = noise_mask

    for cluster_id, group in df[~df['dbscan_noise']].groupby('cluster'):
        count = len(group)
        print(f"\n--- Cluster {cluster_id} ({count} samples) ---")
        for idx, row in enumerate(group.head(TOP_N).itertuples(), 1):
            print(f"{idx}. Prompt: {shorten(row.prompt)}")
            print(f"   Response: {shorten(row.response)}\n")
