In [1]:
import os
import sys

# Repository root for this project. Set the CMSE_495_ROOT environment variable
# to run the notebook from another checkout; the fallback is the original
# author's directory so existing runs behave exactly as before.
repo_root = os.environ.get("CMSE_495_ROOT", "/home/raosidha/CMSE_495")

if not os.path.isdir(repo_root):
    raise FileNotFoundError(
        f"Repository root not found: {repo_root!r}. "
        "Set the CMSE_495_ROOT environment variable to your checkout."
    )

# All relative paths used later in the notebook resolve against the repo root.
os.chdir(repo_root)

# Make the repo root and its src/ directory importable, with src/ taking
# precedence. Any existing entry is removed first so that re-running this
# cell does not keep stacking duplicates onto sys.path.
for entry in (repo_root, os.path.join(repo_root, "src")):
    if entry in sys.path:
        sys.path.remove(entry)
    sys.path.insert(0, entry)

print(f"Directory set to: {os.getcwd()}")

Directory set to: /home/raosidha/CMSE_495


In [2]:
import numpy as np
import pandas as pd
import phate, umap
from scipy.cluster.hierarchy import fcluster
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, rand_score
from hdbscan import HDBSCAN
from custom_packages.diffusion_condensation import DiffusionCondensation as dc
from custom_packages.fowlkes_mallows import FowlkesMallows

# Dataset locations. These are relative paths, so they resolve against the
# current working directory.
metadata_path = "src/data/rcv1_v2/src_2/data_2/rcv1_v2/rcv1_qwen_metadata.csv"
embeddings_path = "src/data/rcv1_v2/src_2/data_2/rcv1_v2/rcv1_qwen_embeddings.npy"

# df: one metadata row per document; data: one embedding row per document.
df = pd.read_csv(metadata_path)
data = np.load(embeddings_path)

# Cluster labels derived from `data` are scored against columns of `df`, so
# the two must have the same number of rows. Fail here rather than after the
# expensive reduction and clustering steps.
if data.shape[0] != len(df):
    raise ValueError(
        f"Row count mismatch: {len(df)} metadata rows in {metadata_path!r} "
        f"vs {data.shape[0]} embeddings in {embeddings_path!r}."
    )

print(f"Success! Loaded {data.shape[0]} documents.")

Success! Loaded 5000 documents.


In [3]:
# Benchmark suite: for every dimensionality reduction, run each clustering
# method at each hierarchy level and score the predicted labels against the
# ground-truth category columns of `df`.
RESULTS_DIR = "Qwen3-Embedding-0.6B_results"
RESULTS_PATH = os.path.join(RESULTS_DIR, "rcv1_results.csv")

print("Running PCA, UMAP, and PHATE...")
# Unfitted reducers. Each embedding is computed inside the loop below, right
# before it is clustered, so scores for a fast reduction (PCA) are available
# without waiting for a slow one (PHATE) to finish.
reducers = {
    "PCA": PCA(n_components=50, random_state=42),
    "UMAP": umap.UMAP(n_components=2, random_state=42),
    "PHATE": phate.PHATE(n_components=3, random_state=42),
}
# Filled as each reduction completes: name -> reduced array.
reductions = {}

results = []
levels = ["category 0", "category 1", "category 2"]
# Ground-truth labels and target cluster count for each hierarchy level.
# Neither depends on the reduction or clustering method, so compute them once.
level_targets = [(df[col], df[col].nunique(dropna=False)) for col in levels]

os.makedirs(RESULTS_DIR, exist_ok=True)

print("Starting clustering suite...")
for red_name, reducer in reducers.items():
    print(f"  Processing {red_name}...")
    red_data = reducer.fit_transform(data)
    reductions[red_name] = red_data

    for method in ["Agglomerative", "HDBSCAN", "DC"]:
        if method == "HDBSCAN":
            # The HDBSCAN fit does not depend on the hierarchy level: fit once
            # per reduction and reuse its single-linkage tree for every cut.
            model = HDBSCAN(min_cluster_size=5)
            model.fit(red_data)
            Z = model.single_linkage_tree_.to_numpy()

        for i, (target, n_clusters) in enumerate(level_targets):
            if method == "Agglomerative":
                model = AgglomerativeClustering(n_clusters=n_clusters)
                lbls = model.fit_predict(red_data)

            elif method == "HDBSCAN":
                # Cut the tree into at most n_clusters flat clusters. fcluster
                # assigns every point a label starting at 1, so there is no
                # -1 "noise" label to remap afterwards.
                lbls = fcluster(Z, n_clusters, criterion='maxclust')

            elif method == "DC":
                # Fit first, then read labels_ from the fitted model.
                # NOTE(review): min_clusters presumably stops condensation once
                # n_clusters remain - confirm against DiffusionCondensation.
                model_dc = dc(
                    min_clusters=n_clusters,
                    max_iterations=5000,
                    k=10,
                    alpha=3
                )
                model_dc.fit(red_data)
                lbls = model_dc.labels_

            # Metrics for this (reduction, method, level) combination.
            results.append({
                "reduction_method": red_name,
                "cluster_method": method,
                "level": i,
                "ARI": adjusted_rand_score(target, lbls),
                "Rand": rand_score(target, lbls),
                "FM": FowlkesMallows.Bk({i: target}, {i: lbls})[i]["FM"]
            })

    # Checkpoint after each reduction so an interruption during a later,
    # slower reduction does not discard the scores already computed.
    pd.DataFrame(results).to_csv(RESULTS_PATH, index=False)

# Save Final Results
scores_df = pd.DataFrame(results)
scores_df.to_csv(RESULTS_PATH, index=False)
print("Final Release Results saved successfully!")

Running PCA, UMAP, and PHATE...
Calculating PHATE...
  Running PHATE on 5000 observations and 1024 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 1.63 seconds.
    Calculating KNN search...
    Calculated KNN search in 4.89 seconds.
    Calculating affinities...
    Calculated affinities in 1.73 seconds.
  Calculated graph and diffusion operator in 8.26 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 3.97 seconds.
    Calculating KMeans...
    Calculated KMeans in 17.00 seconds.
  Calculated landmark operator in 20.96 seconds.
  Calculating optimal t...
    Automatically selected t = 35
  Calculated optimal t in 20.21 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 1.58 seconds.
  Calculating metric MDS...
    SGD-MDS may not have converged: stress changed by 1.2% in final iterations. Consider increasing n_iter or adjusting learning_rate.
  Calculated metric

reduction_method,cluster_method,level,ARI,Rand,FM

PCA,Agglomerative,0,0.1716996411770845,0.8314942188437687,0.32100305892801256

PCA,Agglomerative,1,0.1716996411770845,0.8314942188437687,0.32100305892801256

PCA,Agglomerative,2,0.1716996411770845,0.8314942188437687,0.32100305892801256

PCA,HDBSCAN,0,0.009609517143094356,0.21238095619123826,0.43573385155534045

PCA,HDBSCAN,1,0.009609517143094356,0.21238095619123826,0.43573385155534045

PCA,HDBSCAN,2,0.009609517143094356,0.21238095619123826,0.43573385155534045

PCA,DC,0,0.12649173269201136,0.7885168233646729,0.24763318814349114

PCA,DC,1,0.12649173269201136,0.7885168233646729,0.24763318814349114

PCA,DC,2,0.12649173269201136,0.7885168233646729,0.24763318814349114

Our initial benchmark on the RCV1 dataset, using Qwen3-Embedding-0.6B embeddings reduced with PCA, shows that Agglomerative Clustering is the most effective of the three methods tested so far, with an ARI of 0.17 (compared with 0.13 for Diffusion Condensation and 0.01 for HDBSCAN). The scores are identical across hierarchy levels because, in the specific data subset used, each document's root and leaf categories are identical. Future work will focus on PHATE (which is still running) to see whether it separates these topics better than PCA.