In [5]:
# Locate the repository root and make it the working directory.
# `os` and `sys` are imported here because this cell runs before the main
# import cell; without them a fresh kernel stops with NameError.
import os
import sys

target_folder = "NCEAS_Unsupervised_NLP"
current_dir = os.getcwd()

# Walk upwards until a directory named `target_folder` is reached.
while os.path.basename(current_dir) != target_folder:
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # The filesystem root is its own parent: nothing left to search.
        raise FileNotFoundError(f"{target_folder} not found.")
    current_dir = parent_dir

os.chdir(current_dir)

# Put the repo root, then `src`, at the front of sys.path (`src` ends up first
# so custom_packages can be imported). An existing entry is removed before the
# insert so that re-running the cell does not accumulate duplicates.
for _path in (current_dir, os.path.join(current_dir, "src")):
    if _path in sys.path:
        sys.path.remove(_path)
    sys.path.insert(0, _path)

In [6]:
# IMPORTS
# Standard Imports
import os
import sys
import re
import warnings
from collections import defaultdict
import numpy as np
import pandas as pd
import phate
import umap
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, rand_score
from hdbscan import HDBSCAN

from custom_packages.diffusion_condensation import DiffusionCondensation as dc
from custom_packages.fowlkes_mallows import FowlkesMallows

# Seed NumPy's global random state so stochastic steps repeat across runs.
np.random.seed(42)
# Install a blanket "ignore" filter: no warnings are shown from here on.
warnings.filterwarnings("ignore")



In [7]:
import os
import sys
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Name of the repository root directory to locate.
target_folder = "NCEAS_Unsupervised_NLP"

# Start the search from the current working directory.
current_dir = os.getcwd()

# Climb the directory tree until a directory named `target_folder` is reached.
while os.path.basename(current_dir) != target_folder:
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # The filesystem root is its own parent, so the target was not found.
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Make the repository root (`target_folder`) the working directory.
os.chdir(current_dir)

# Put the repository root at the front of sys.path. Any earlier copy is
# removed first so repeated runs do not leave duplicate entries behind.
if current_dir in sys.path:
    sys.path.remove(current_dir)
sys.path.insert(0, current_dir)

In [8]:
import sys
import os

# Give the ./src folder (relative to the working directory) top import priority.
src_dir = os.path.join(os.getcwd(), "src")
sys.path.insert(0, src_dir)


In [9]:
# Climb from the working directory until the repository root is found.
target_folder = "NCEAS_Unsupervised_NLP"
current_dir = os.getcwd()

while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # Reached the filesystem root without a match.
        raise FileNotFoundError(f"{target_folder} not found.")
    current_dir = parent_dir

os.chdir(current_dir)

# Prepend the repo root first and `src` second, so `src` ends up at index 0
# and custom_packages can be imported.
for extra_path in (current_dir, os.path.join(current_dir, "src")):
    sys.path.insert(0, extra_path)


In [10]:
# ===================
# Standard Libraries
# ===================
import importlib
import os
import re
import warnings
from collections import defaultdict

# ===================
# Data Manipulation
# ===================
import numpy as np
import pandas as pd

# ==========================
# Dimensionality Reduction
# ==========================
import phate
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ========================
# Clustering
# ========================
from hdbscan import HDBSCAN
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import AgglomerativeClustering
from custom_packages.diffusion_condensation import DiffusionCondensation as dc

# ======================
# Evaluation Metrics
# ======================
from custom_packages.fowlkes_mallows import FowlkesMallows
from sklearn.metrics import adjusted_rand_score, rand_score

from tqdm import tqdm

# Silence all warnings, then seed NumPy's global random state.
warnings.filterwarnings("ignore")
np.random.seed(42)
# Reload the already-imported phate module. Kept as the final expression, so
# the reloaded module object is what the cell displays.
importlib.reload(phate)


<module 'phate' from '/opt/anaconda3/lib/python3.12/site-packages/phate/__init__.py'>

In [12]:
import pandas as pd

# Raw arXiv export; the code below reads its "text" and "label" columns.
df = pd.read_csv("src/data/arxiv/data/arxiv/arxiv_30k_clean.csv")

# Build the three-column frame used downstream:
#   topic      - the document text
#   category_1 - the full label
#   category_0 - the part of the label before the first "."
df_new = pd.DataFrame()
df_new["topic"] = df["text"]
df_new["category_1"] = df["label"]
# The .str accessor leaves a missing label as NaN (removed by dropna below),
# whereas calling x.split on a NaN float raised AttributeError before the
# dropna could run. regex=False keeps "." a literal separator.
df_new["category_0"] = df["label"].str.split(".", regex=False).str[0]

# Discard rows with any missing field and renumber the index from 0.
df_new = df_new.dropna().reset_index(drop=True)


In [13]:
# Drop incomplete rows, then keep only rows whose topic is a non-blank string.
df_new = df_new.dropna().reset_index(drop=True)

has_text = df_new["topic"].apply(
    lambda value: isinstance(value, str) and value.strip() != ""
)
df_new = df_new[has_text].reset_index(drop=True)


In [14]:
# Persist the cleaned frame as CSV; index=False leaves the row index out.
df_new.to_csv("src/data/arxiv/arxiv_clean.csv", index=False)

In [15]:
# Display the (rows, columns) shape of the cleaned frame as the cell output.
df_new.shape

(30000, 3)

In [16]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import numpy as np

def get_embeddings(texts, model_name="Qwen/Qwen3-Embedding-0.6B",
                   batch_size=32, device=None):
    """Encode texts with a SentenceTransformer model.

    Parameters
    ----------
    texts : list of str
        Documents to embed; passed straight to ``model.encode``.
    model_name : str
        Model identifier handed to ``SentenceTransformer``.
    batch_size : int, default 32
        Number of texts encoded per batch.
    device : str or None, default None
        Device to run on. ``None`` picks ``"cuda"`` when
        ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.

    Returns
    -------
    The value returned by ``model.encode`` with ``convert_to_numpy=True`` and
    ``normalize_embeddings=True`` (presumably a 2-D NumPy array with one row
    per text -- confirm against the sentence-transformers documentation).
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    model = SentenceTransformer(model_name, device=device)

    return model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )


In [17]:
# Run configuration: dataset tag and embedding-model name used in paths.
dataset_name = "arxiv"
embedding_model = "Qwen3-Embedding-0.6B"

# Make sure the results folder and the raw-embedding folder both exist.
for out_dir in (f"{embedding_model}_results", "qwen_embeddings"):
    os.makedirs(out_dir, exist_ok=True)

# Embed every topic string, then store the result on disk.
topic_texts = df_new["topic"].tolist()
embedding_list = get_embeddings(topic_texts)

np.save(f"qwen_embeddings/{dataset_name}_embed.npy", embedding_list)


Using device: cpu




Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

In [None]:
# From here on the short tag "qwen" is what output paths are built from.
dataset_name = "arxiv"
embedding_model = "qwen"

os.makedirs("qwen_embeddings", exist_ok=True)

# Store the embeddings again, this time with the model tag in the file name.
embed_path = f"qwen_embeddings/{dataset_name}_{embedding_model}_embed.npy"
np.save(embed_path, embedding_list)


NameError: name 'embedding_list' is not defined

In [None]:
# Create the folder that the reduced embeddings are saved into;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs(f'{embedding_model}_reduced_embeddings', exist_ok=True)

In [None]:
# Draw one fixed permutation (seed 42) and apply it to both the documents and
# their embeddings so the two stay row-aligned.
shuffle_rng = np.random.RandomState(seed=42)
shuffle_idx = shuffle_rng.permutation(len(df_new))

topic_data = df_new.iloc[shuffle_idx].reset_index(drop=True)
data = np.array(embedding_list)[shuffle_idx]

# argsort of a permutation is its inverse: indexing shuffled rows with
# reverse_idx puts them back in the original order.
reverse_idx = np.argsort(shuffle_idx)

NameError: name 'embedding_list' is not defined

In [None]:
# Show which columns the shuffled frame carries.
print(topic_data.columns)


Index(['topic', 'category_1', 'category_0'], dtype='object')


In [None]:
# For every column named category_<digits>, map the number of distinct labels
# in that column to the column's values as an array. A later column with the
# same count replaces an earlier one.
topic_dict = {
    len(topic_data[col].unique()): np.array(topic_data[col])
    for col in topic_data.columns
    if re.match(r'^category_\d+$', col)
}

In [None]:
# PHATE reduction of the shuffled embeddings to 300 components, with the
# diffusion time `t` chosen automatically.
# Parameter note carried over from the original cell: {'k':10,'alpha':4,'t':3}
phate_params = dict(
    n_jobs=-2,
    random_state=42,
    n_components=300,
    decay=20,
    t="auto",
    n_pca=None,
)
reducer_model = phate.PHATE(**phate_params)
embed_phate = reducer_model.fit_transform(data)

phate_path = f"{embedding_model}_reduced_embeddings/PHATE_{dataset_name}_embed.npy"
np.save(phate_path, embed_phate)


Calculating PHATE...
  Running PHATE on 14824 observations and 3072 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 65.17 seconds.
    Calculating affinities...
    Calculated affinities in 43.97 seconds.
  Calculated graph and diffusion operator in 109.20 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 1.41 seconds.
    Calculating KMeans...
    Calculated KMeans in 2.12 seconds.
  Calculated landmark operator in 4.17 seconds.
  Calculating optimal t...
    Automatically selected t = 33
  Calculated optimal t in 0.76 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.40 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 154.98 seconds.
Calculated PHATE in 269.62 seconds.


In [None]:
# Reload the PHATE embedding for the current dataset. The file name is built
# from `dataset_name`, the same pattern the notebook uses when saving; the
# previous hard-coded "amz" tag named a file this notebook never writes.
embed_phate = np.load(
    f"{embedding_model}_reduced_embeddings/PHATE_{dataset_name}_embed.npy"
)


In [None]:
# cluster_levels holds the number of distinct labels at each hierarchy level,
# highest-numbered category column first. The levels are taken from the
# category_<n> columns that actually exist instead of assuming three, which
# raised KeyError for frames that have no category_2 column.
category_cols = sorted(
    (col for col in topic_data.columns if re.match(r'^category_\d+$', col)),
    key=lambda col: int(col.rsplit("_", 1)[1]),
    reverse=True,
)
depth = len(category_cols)
cluster_levels = [len(topic_data[col].unique()) for col in category_cols]

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import umap
import matplotlib.pyplot as plt

# Switches for the optional PCA and UMAP reductions.
include_pca = True
include_umap = True

# Array copy of the shuffled embeddings, used as input to PCA and UMAP.
embeddings = np.array(data)

# Reduced embeddings keyed by method name. PHATE was computed earlier, so it
# is only registered and written to disk here.
embedding_methods = {"PHATE": embed_phate}
phate_out = f"{embedding_model}_reduced_embeddings/PHATE_{dataset_name}_embed.npy"
np.save(phate_out, embedding_methods["PHATE"])

# PCA down to 300 components.
if include_pca:
    pca = PCA(n_components=300, random_state=42)
    embedding_methods["PCA"] = pca.fit_transform(embeddings)
    pca_out = f"{embedding_model}_reduced_embeddings/PCA_{dataset_name}_embed.npy"
    np.save(pca_out, embedding_methods["PCA"])

# UMAP down to 300 components.
if include_umap:
    umap_model = umap.UMAP(
        n_neighbors=10,
        min_dist=0.05,
        n_components=300,
        random_state=42,
    )
    embedding_methods["UMAP"] = umap_model.fit_transform(embeddings)
    umap_out = f"{embedding_model}_reduced_embeddings/UMAP_{dataset_name}_embed.npy"
    np.save(umap_out, embedding_methods["UMAP"])

# =====================
# Optional: t-SNE
# =====================
# from sklearn.manifold import TSNE
# tsne_model = TSNE(n_components=3, random_state=42)
# embedding_methods["tSNE"] = tsne_model.fit_transform(embeddings)
# np.save(
#     f"{embedding_model}_reduced_embeddings/tSNE_{dataset_name}_embed.npy",
#     embedding_methods["tSNE"]
# )


In [None]:
import hdbscan

# scores_all[(reduction, clusterer)][metric] is a list of scores, one per
# entry of cluster_levels and in the same order.
scores_all = defaultdict(lambda: defaultdict(list))

# Ground-truth granularities on offer (the keys of topic_dict). They do not
# change inside the loops, so they are computed once here.
available_levels = np.array(sorted(topic_dict.keys()))

for embed_name, embed_data in tqdm(embedding_methods.items()):
    for cluster_method in ["Agglomerative", "HDBSCAN", "DC"]:
        for level in cluster_levels:

            # -----------------
            # Clustering
            # -----------------
            if cluster_method == "Agglomerative":
                model = AgglomerativeClustering(n_clusters=level)
                model.fit(embed_data)
                labels = model.labels_

            elif cluster_method == "HDBSCAN":
                model = HDBSCAN(min_cluster_size=level)
                model.fit(embed_data)

                # Cut HDBSCAN's single-linkage tree into at most `level`
                # flat clusters.
                Z = model.single_linkage_tree_.to_numpy()
                labels = fcluster(Z, level, criterion='maxclust')
                # NOTE(review): fcluster labels appear to start at 1, so this
                # remap of -1 looks like a no-op -- confirm before removing.
                labels[labels == -1] = labels.max() + 1

            elif cluster_method == "DC":
                model = dc(min_clusters=level, max_iterations=5000, k=10, alpha=3)
                model.fit(embed_data)
                labels = model.labels_

            # -----------------
            # Match ground truth
            # -----------------
            # Compare against the label set whose size is closest to `level`.
            closest_level = min(available_levels, key=lambda k: abs(k - level))

            topic_series = topic_dict[closest_level]
            valid_idx = ~pd.isna(topic_series)

            target_lst = topic_series[valid_idx]
            label_lst = labels[valid_idx]

            # -----------------
            # Metrics
            # -----------------
            try:
                fm_score = FowlkesMallows.Bk(
                    {level: target_lst},
                    {level: label_lst}
                )[level]["FM"]
            except Exception:
                # Best effort: a failed FM computation is recorded as NaN.
                # Catching Exception rather than using a bare except lets
                # KeyboardInterrupt and SystemExit stop this long-running loop.
                fm_score = np.nan

            scores_all[(embed_name, cluster_method)]["FM"].append(fm_score)
            scores_all[(embed_name, cluster_method)]["Rand"].append(
                rand_score(target_lst, label_lst)
            )
            scores_all[(embed_name, cluster_method)]["ARI"].append(
                adjusted_rand_score(target_lst, label_lst)
            )


100%|██████████| 3/3 [39:28<00:00, 789.65s/it]


In [None]:
# Flatten scores_all into one row per (reduction, clusterer, level).
rows = []

for (embed_name, cluster_method), score_dict in scores_all.items():
    n_levels = len(score_dict["FM"])

    # Scores were appended in cluster_levels order, so position i in each
    # score list belongs to cluster_levels[i].
    for i in range(n_levels):
        rows.append({
            "reduction_method": embed_name,
            "cluster_method": cluster_method,
            "level": cluster_levels[i],
            "FM": score_dict["FM"][i],
            "Rand": score_dict["Rand"][i],
            "ARI": score_dict["ARI"][i],
        })

scores_df = pd.DataFrame(rows)

scores_df = scores_df.sort_values(
    by=["reduction_method", "cluster_method", "level"]
).reset_index(drop=True)

results_dir = f"{embedding_model}_results"
results_path = f"{results_dir}/other_{dataset_name}_results.csv"

# `embedding_model` is reassigned after the original results folder is
# created, so the folder for its current value may be missing. Create it
# here; otherwise to_csv fails on the non-existent directory.
os.makedirs(results_dir, exist_ok=True)

# Append mode: the header is written only when the file is new, and running
# the cell again adds the same rows a second time.
write_header = not os.path.exists(results_path)

scores_df.to_csv(
    results_path,
    mode="a",
    index=False,
    header=write_header
)


In [None]:
import json

# Colour lookup for the plots; the plotting cell queries it with keys of the
# form "<reduction>_<clusterer>". The encoding is given explicitly so the
# file is read the same way on every platform.
with open("combo_color_map.json", "r", encoding="utf-8") as file:
    combo_color_map = json.load(file)

In [None]:
import matplotlib.pyplot as plt

metrics = ['FM', 'Rand', 'ARI']

# One figure per metric, one line per (reduction, clusterer) combination.
for metric in metrics:
    plt.figure(figsize=(10, 6))

    for combo, metric_scores in scores_all.items():
        embed_name, method = combo
        # Spell out the abbreviation; the long name is used for both the
        # legend label and the colour lookup.
        method = "Diffusion Condensation" if method == "DC" else method
        combo_key = f"{embed_name}_{method}"
        line_color = combo_color_map.get(combo_key, 'black')
        plt.plot(
            cluster_levels,
            metric_scores[metric],
            marker='o',
            label=f"{embed_name} {method}",
            color=line_color,
        )

    plt.title(f"{metric} Score Across Cluster Levels")
    plt.xlabel("Cluster Level")
    plt.ylabel(metric)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()