In [None]:
import umap
import json
import joblib
import pandas as pd
import numpy as np
import cupy as cp
from google.colab import drive
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml import UMAP, KMeans, DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sentence_transformers import SentenceTransformer
from collections import defaultdict

Clustering and visualization

In [None]:
# Load prompt/response pairs from the cleaned JSONL file (one JSON object per line).
in_path = "/content/drive/MyDrive/MSE 641 Project Data/wildchat_en_cleaned.jsonl"
prompts = []
responses = []

with open(in_path, "r", encoding="utf-8") as f:
  for line in f:
    # A blank line (e.g. a trailing newline at the end of the file) is not valid
    # JSON and would make json.loads raise, aborting the whole load; skip it.
    if not line.strip():
      continue
    obj = json.loads(line)
    # Only records carrying both fields are kept, so prompts[i] and responses[i]
    # always belong to the same record.
    if "prompt" in obj and "response" in obj:
      prompts.append(obj["prompt"])
      responses.append(obj["response"])

# One document per record: prompt and response joined by a single space.
texts = [p + " " + r for p, r in zip(prompts, responses)]

# Sentence embeddings for every prompt+response document
# (checkpoint loaded through sentence-transformers: 'all-MiniLM-L6-v2').
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
  texts,
  batch_size=64,
  show_progress_bar=True,
)

# Linear reduction of the embeddings to SVD_DIM components; X_svd is the input to UMAP.
SVD_DIM = 100
svd = TruncatedSVD(random_state=42, n_components=SVD_DIM)
X_svd = svd.fit_transform(embeddings)

# Hyper-parameter grid: UMAP output dimension / neighbour count, KMeans k,
# and the fixed DBSCAN settings used to refine each KMeans cluster.
umap_neighbors = [30]
umap_dims = [30]
k_range = [5, 10, 20]
dbscan_eps = 1.2
dbscan_min_samples = 15

# Final labels are encoded as `kmeans_cluster * LABEL_STRIDE + dbscan_sub_label`;
# -1 marks points that DBSCAN classified as noise.
LABEL_STRIDE = 1000

results = []         # one metrics row per (umap_dim, umap_nn, k) configuration
best_k_results = {}  # k -> best configuration (highest silhouette) seen for that k

for umap_dim in umap_dims:
  for umap_nn in umap_neighbors:
    # The UMAP projection does not depend on k, so it is fitted once per (dim, nn) pair.
    umap_model = umap.UMAP(n_components=umap_dim, n_neighbors=umap_nn, random_state=42)
    X_umap = umap_model.fit_transform(X_svd)

    for k in k_range:
      kmeans = KMeans(n_clusters=k, random_state=42)
      kmeans_labels = kmeans.fit_predict(X_umap)

      # Refine every KMeans cluster with DBSCAN. Points start as noise (-1) and are
      # overwritten only when DBSCAN assigns them to a sub-cluster.
      final_labels = np.full(len(X_umap), -1)
      for i in range(k):
        # Row positions (in X_umap) of the members of KMeans cluster i, computed once
        # per cluster instead of once per point.
        member_idx = np.flatnonzero(kmeans_labels == i)
        if member_idx.size == 0:
          continue
        dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
        sub_labels = np.asarray(dbscan.fit_predict(X_umap[member_idx]))

        clustered = sub_labels >= 0
        if clustered.any() and sub_labels.max() >= LABEL_STRIDE:
          # Otherwise sub-cluster LABEL_STRIDE of cluster i would silently receive
          # the same id as sub-cluster 0 of cluster i + 1.
          raise ValueError(
            f"DBSCAN produced {int(sub_labels.max()) + 1} sub-clusters inside KMeans cluster {i}; "
            f"label encoding supports fewer than {LABEL_STRIDE}"
          )
        final_labels[member_idx[clustered]] = i * LABEL_STRIDE + sub_labels[clustered]

      # Metrics are computed on non-noise points only, and only when at least two
      # distinct clusters remain.
      valid = final_labels >= 0
      X_valid = X_umap[valid]
      labels_valid = final_labels[valid]
      n_clusters = len(np.unique(labels_valid))
      if n_clusters > 1:
        sil = silhouette_score(X_valid, labels_valid)
        ch = calinski_harabasz_score(X_valid, labels_valid)
        db = davies_bouldin_score(X_valid, labels_valid)
      else:
        sil, ch, db = float('nan'), float('nan'), float('nan')

      results.append({
        'umap_dim': umap_dim,
        'umap_nn': umap_nn,
        'k': k,
        'dbscan_eps': dbscan_eps,
        'dbscan_min_samples': dbscan_min_samples,
        'silhouette': sil,
        'calinski_harabasz': ch,
        'davies_bouldin': db,
        'n_clusters': n_clusters,
      })

      # Keep the configuration with the highest silhouette for each k. A stored NaN
      # must be replaceable: `sil > nan` is always False, so without the explicit
      # isnan check a first NaN entry could never be superseded by a real score.
      best = best_k_results.get(k)
      if best is None or np.isnan(best["silhouette"]) or sil > best["silhouette"]:
        best_k_results[k] = {
          "silhouette": sil,
          "dim": umap_dim,
          "nn": umap_nn,
          "k": k,
          "labels": final_labels,
          "X_umap": X_umap
          }

# Save and plot the best result for each k.
for k, result in best_k_results.items():
  X_umap = result["X_umap"]
  labels = result["labels"]
  dim = result["dim"]
  nn = result["nn"]

  # Persist the UMAP matrix and the final labels so later cells can reload them.
  joblib.dump(X_umap, f"/content/drive/MyDrive/MSE 641 Project Data/bert_umap_matrix_k{k}_best.pkl")
  joblib.dump(labels, f"/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_labels_k{k}_best.pkl")

  # visualization: project to 2-D for plotting only; the saved matrix keeps all UMAP dimensions
  if X_umap.shape[1] > 2:
    svd2d = TruncatedSVD(n_components=2, random_state=42)
    X_2d = svd2d.fit_transform(X_umap)
  else:
    X_2d = X_umap

  plt.figure(figsize=(10, 6))
  scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='tab20', s=10, alpha=0.7)
  # The DBSCAN settings are taken from the configuration variables so the title cannot
  # drift from the parameters that were actually used.
  plt.title(f"BERT + SVD({SVD_DIM}) + UMAP({dim},{nn}) + KMeans(k={k}) + DBSCAN({dbscan_eps},{dbscan_min_samples})")
  plt.xlabel("Component 1")
  plt.ylabel("Component 2")
  plt.grid(True, linestyle='--', alpha=0.3)
  plt.colorbar(scatter, label="Cluster ID")
  plt.tight_layout()
  # Write the file BEFORE plt.show(): once the figure has been shown (as with the
  # notebook inline backend) the current figure is released, and a savefig issued
  # afterwards writes an empty canvas.
  plt.savefig(f"/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_best_k{k}_plot.png")
  plt.show()
  plt.close()

# Write the metrics rows gathered in `results` to a CSV file on Drive (row index omitted).
summary_path = "/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_dbscan_metrics_best_per_k.csv"
summary_df = pd.DataFrame(results)
summary_df.to_csv(path_or_buf=summary_path, index=False)


Clustering evaluation metrics (recomputed from saved files)

In [None]:
# Recompute and print clustering metrics from the label / UMAP files saved earlier.
import os, re, glob
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

OUT_DIR = "/content/drive/MyDrive/MSE 641 Project Data"


def _fmt_metric(value, spec):
  """Format a finite metric with `spec`; any non-finite value is rendered as "NaN"."""
  return format(value, spec) if np.isfinite(value) else "NaN"


label_files = sorted(glob.glob(os.path.join(OUT_DIR, "bert_kmeans_labels_k*_best.pkl")))
if len(label_files) == 0:
  raise FileNotFoundError(f"No saved label files like 'bert_kmeans_labels_k*_best.pkl' found in {OUT_DIR}")

rows = []
for label_path in label_files:
  # The value of k is recovered from the file name.
  label_name = os.path.basename(label_path)
  k_match = re.search(r"k(\d+)", label_name)
  if k_match is None:
    print(f"[skip] cannot parse k from: {label_name}")
    continue
  k = int(k_match.group(1))

  # Each label file needs the UMAP matrix saved for the same k.
  umap_path = os.path.join(OUT_DIR, f"bert_umap_matrix_k{k}_best.pkl")
  if not os.path.exists(umap_path):
    print(f"[skip] UMAP matrix for k={k} not found: {umap_path}")
    continue

  # load data
  X_umap = joblib.load(umap_path)
  labels = joblib.load(label_path)

  # calculate evaluation metrics on non-noise points (label >= 0) only
  mask = labels >= 0
  X_valid = X_umap[mask]
  labels_valid = labels[mask]
  if X_valid.shape[0] == 0 or np.unique(labels_valid).size <= 1:
    # Nothing to score: no valid points, or a single cluster.
    sil = ch = db = np.nan
  else:
    sil = silhouette_score(X_valid, labels_valid)
    ch = calinski_harabasz_score(X_valid, labels_valid)
    db = davies_bouldin_score(X_valid, labels_valid)

  print(f"\n==== Metrics for k={k} (valid points: {mask.sum()}/{len(labels)}) ====")
  print(f"Silhouette Score: {_fmt_metric(sil, '.4f')}")
  print(f"Calinski-Harabasz Index: {_fmt_metric(ch, '.1f')}")
  print(f"Davies-Bouldin Index: {_fmt_metric(db, '.4f')}")

  rows.append({
    "k": k,
    "silhouette": sil,
    "calinski_harabasz": ch,
    "davies_bouldin": db,
    "valid_n": int(mask.sum())
  })

# display: one summary table, ordered by k
if rows:
  df = pd.DataFrame(rows).sort_values("k")
  print("\nSummary:")
  print(df.to_string(index=False))


Clustering result examples

In [None]:
# BERT clustering result: print a few example prompt/response pairs per cluster.
import pandas as pd

# Which clustering to inspect. This used to be whatever `labels` the previously
# executed loop happened to leave behind (k=5 after a top-to-bottom run, because the
# label files are visited in lexicographic order and "k5" sorts last). Selecting it
# explicitly makes the cell independent of execution order.
EXAMPLE_K = 5
TOP_N = 10      # examples printed per cluster
MAX_CHARS = 100  # prompt/response text is cut to this many characters


def _shorten(text, limit=MAX_CHARS):
  """Return `text` cut to `limit` characters, with "..." appended when it was cut."""
  return (text[:limit] + "...") if len(text) > limit else text


def _print_examples(frame, response_indent):
  """Print the first TOP_N prompt/response pairs of `frame`, numbered from 1.

  `response_indent` is the text placed before "Response:" on the second line.
  """
  for idx, row in enumerate(frame.head(TOP_N).itertuples(), 1):
    print(f"{idx}. Prompt: {_shorten(row.prompt)}")
    print(f"{response_indent}Response: {_shorten(row.response)}\n")


example_labels = best_k_results[EXAMPLE_K]["labels"]
if len(example_labels) != len(prompts):
  # Labels are matched to prompts by position, so a size mismatch means the
  # clustering was computed on a different data set than the one currently loaded.
  raise ValueError(
    f"labels for k={EXAMPLE_K} have {len(example_labels)} entries but {len(prompts)} prompts are loaded"
  )

df_clu = pd.DataFrame({
  'prompt': prompts,
  'response': responses,
  'cluster': example_labels
})

# clustered points, one section per cluster id
for clu_id, group in df_clu[df_clu['cluster'] != -1].groupby('cluster'):
  print(f"\n--- Cluster {clu_id} ({len(group)} samples) ---")
  _print_examples(group, "   ")

# noise points
noise = df_clu[df_clu['cluster'] == -1]
if not noise.empty:
  print(f"\n--- Noise (DBSCAN label=-1), {len(noise)} samples ---")
  _print_examples(noise, "")
