In [None]:
import umap
import json
import joblib
import pandas as pd
import numpy as np
import cupy as cp
from google.colab import drive
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml import UMAP, KMeans, DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sentence_transformers import SentenceTransformer
from collections import defaultdict

Clustering and visualization

In [None]:
# Load data
# Read the JSONL export one record per line and collect, for every record whose
# "conversation" field contains both a "Prompt:" and a "Response:" marker:
#   prompts   - the text after the first "Prompt:" up to the next marker
#   responses - the text between the first and second "Response:" markers
#               (or to the end of the string when there is only one)
#   texts     - prompt and response joined by a single space (the clustering input)
# Records without a "conversation" key or without both markers are skipped.
in_path = "/content/drive/MyDrive/MSE 641 Project Data/wildchat_work_only.jsonl"

prompts = []
responses = []
texts = []

with open(in_path, "r", encoding="utf-8") as f:
  for line in f:
    # Blank lines (e.g. a trailing newline at the end of the file) are not valid
    # JSON; skip them instead of letting json.loads abort the whole load.
    if not line.strip():
      continue

    obj = json.loads(line)
    if "conversation" not in obj:
      continue

    conv = obj["conversation"]
    if "Prompt:" not in conv or "Response:" not in conv:
      continue

    # NOTE(review): with several Prompt:/Response: pairs in one conversation only
    # the segments described above are kept - confirm this is the intended
    # handling of multi-turn conversations.
    prompt_part = conv.split("Prompt:")[1].split("Response:")[0].strip()
    response_part = conv.split("Response:")[1].strip()

    prompts.append(prompt_part)
    responses.append(response_part)
    texts.append(prompt_part + " " + response_part)

# BERT embedding
# Encode every combined prompt+response string in `texts` with the
# 'all-MiniLM-L6-v2' SentenceTransformer model, 64 texts per batch.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

# Reduce the embeddings to SVD_DIM components with TruncatedSVD; the result
# (X_svd) is what UMAP is fitted on afterwards. random_state is fixed so the
# projection is repeatable.
SVD_DIM = 100
svd = TruncatedSVD(n_components=SVD_DIM, random_state=42)
X_svd = svd.fit_transform(embeddings)

# Parameters
# Grid of UMAP settings and k-means cluster counts to evaluate. The DBSCAN
# settings are fixed and applied inside every k-means cluster.
umap_neighbors = [30]
umap_dims = [30]
k_range = [5, 10, 20]
dbscan_eps = 1.2
dbscan_min_samples = 15

results = []         # one metrics row per (umap_dim, umap_nn, k) combination
best_k_results = {}  # k -> run with the highest silhouette score seen for that k

for umap_dim in umap_dims:
  for umap_nn in umap_neighbors:
    umap_model = umap.UMAP(n_components=umap_dim, n_neighbors=umap_nn, random_state=42)
    X_umap = umap_model.fit_transform(X_svd)

    for k in k_range:
      # Stage 1: coarse partition of the UMAP space with k-means.
      kmeans = KMeans(n_clusters=k, random_state=42)
      kmeans_labels = kmeans.fit_predict(X_umap)

      # Stage 2: DBSCAN inside each k-means cluster. Final label is
      # i * 1000 + DBSCAN sub-label, so ids stay unique across k-means clusters
      # as long as a cluster has fewer than 1000 sub-clusters. -1 marks noise.
      final_labels = np.full(len(X_umap), -1)
      for i in range(k):
        # Row positions of this k-means cluster, computed once per cluster
        # (previously recomputed with np.where for every single point).
        member_idx = np.flatnonzero(kmeans_labels == i)
        if member_idx.size == 0:
          continue

        dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
        sub_labels = dbscan.fit_predict(X_umap[member_idx])

        # Points DBSCAN marks as noise (negative label) keep the initial -1.
        keep = sub_labels >= 0
        final_labels[member_idx[keep]] = i * 1000 + sub_labels[keep]

      # Score only the points that were assigned to a cluster.
      valid = final_labels >= 0
      X_valid = X_umap[valid]
      labels_valid = final_labels[valid]
      n_clusters = len(np.unique(labels_valid))
      if n_clusters > 1:
        sil = silhouette_score(X_valid, labels_valid)
        ch = calinski_harabasz_score(X_valid, labels_valid)
        db = davies_bouldin_score(X_valid, labels_valid)
      else:
        # The metrics are undefined with fewer than two clusters.
        sil, ch, db = float('nan'), float('nan'), float('nan')

      results.append({
        'umap_dim': umap_dim,
        'umap_nn': umap_nn,
        'k': k,
        'dbscan_eps': dbscan_eps,
        'dbscan_min_samples': dbscan_min_samples,
        'silhouette': sil,
        'calinski_harabasz': ch,
        'davies_bouldin': db,
        'n_clusters': n_clusters,
      })

      # Keep the best run per k. Any comparison with NaN is False, so a stored
      # NaN score must be handled explicitly or it could never be replaced by a
      # later valid score.
      previous = best_k_results.get(k)
      is_better = previous is None or (
        not np.isnan(sil)
        and (np.isnan(previous["silhouette"]) or sil > previous["silhouette"])
      )
      if is_better:
        best_k_results[k] = {
          "silhouette": sil,
          "dim": umap_dim,
          "nn": umap_nn,
          "k": k,
          "labels": final_labels,
          "X_umap": X_umap
        }

# Save and Plot for Best Per-k Results
# For every k: persist the UMAP matrix and the final labels with joblib, then
# draw a 2-D scatter of the clustering and save it as a PNG next to the data.
for k, result in best_k_results.items():
  X_umap = result["X_umap"]
  labels = result["labels"]
  dim = result["dim"]
  nn = result["nn"]

  joblib.dump(X_umap, f"/content/drive/MyDrive/MSE 641 Project Data/bert_umap_matrix_k{k}_best_work.pkl")
  joblib.dump(labels, f"/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_labels_k{k}_best_work.pkl")

  # Project to 2-D for plotting only. A separate name is used so the SVD model
  # fitted on the embeddings (`svd`) is not overwritten.
  if X_umap.shape[1] > 2:
    svd_2d = TruncatedSVD(n_components=2, random_state=42)
    X_2d = svd_2d.fit_transform(X_umap)
  else:
    X_2d = X_umap

  fig, ax = plt.subplots(figsize=(10, 6))
  scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='tab20', s=10, alpha=0.7)
  # Title reports the DBSCAN settings actually used instead of hard-coded values.
  ax.set_title(f"BERT + UMAP({dim},{nn}) + KMeans(k={k}) + DBSCAN({dbscan_eps},{dbscan_min_samples})")
  ax.set_xlabel("Component 1")
  ax.set_ylabel("Component 2")
  ax.grid(True, linestyle='--', alpha=0.3)
  fig.colorbar(scatter, ax=ax, label="Cluster ID")
  fig.tight_layout()

  # Save before showing: with the inline backend plt.show() finalises the
  # figure, so a savefig issued afterwards writes an empty image.
  fig.savefig(f"/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_best_k{k}_plot_work.png")
  plt.show()
  plt.close(fig)

# Save evaluation summary
# `results` holds one metrics dict per evaluated (umap_dim, umap_nn, k)
# combination; write them all as a CSV table without the index column.
summary_df = pd.DataFrame(results)
summary_path = "/content/drive/MyDrive/MSE 641 Project Data/bert_kmeans_dbscan_metrics_best_per_k_work.csv"
summary_df.to_csv(summary_path, index=False)


Clustering result examples

In [None]:
import random
import pandas as pd

# Display clustering results: for every k in best_k_results, print up to
# 10 sample texts from each non-noise cluster.
num_samples_per_cluster = 10

# Dedicated, seeded generator so the printed samples are the same on every run.
sample_rng = random.Random(42)

for k, result in best_k_results.items():
  print(f"\n--- Results for k = {k} ---")
  labels = result["labels"]

  data = pd.DataFrame({
    "label": labels,
    "text": texts
  })

  # Label -1 marks noise points and is not reported. groupby yields the
  # clusters in ascending label order and scans the frame only once.
  clustered = data[data["label"] != -1]
  for cluster, group in clustered.groupby("label"):
    cluster_texts = group["text"].tolist()
    print(f"\nCluster {cluster} (size={len(cluster_texts)}):")
    if len(cluster_texts) > num_samples_per_cluster:
      sample_texts = sample_rng.sample(cluster_texts, num_samples_per_cluster)
    else:
      # Small cluster: show everything, in original order.
      sample_texts = cluster_texts

    for i, text in enumerate(sample_texts):
      # First 300 characters on a single line.
      snippet = text[:300].replace('\n', ' ')
      print(f"Sample {i+1}: {snippet}...")
      