In [1]:
import os
import sys
from dotenv import load_dotenv
load_dotenv() 

# Name of the project root directory we need to locate
target_folder = "phate-for-text"

# Begin the search in whatever directory the kernel was started from
current_dir = os.getcwd()

# Climb one directory at a time until the basename matches the target
while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
    if parent_dir == current_dir:
        # The parent of the filesystem root is the root itself: nothing left to search
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Run the rest of the notebook from the project root so relative paths resolve there
os.chdir(current_dir)

# Put the project root first on sys.path so its packages can be imported
sys.path.insert(0, current_dir)

In [2]:
# ===================
# Standard Libraries
# ===================
import importlib
import os
import re
import warnings
from collections import defaultdict
import gc

# ===================
# Data Manipulation
# ===================
import numpy as np
import pandas as pd

# ==========================
# Dimensionality Reduction
# ==========================
import phate
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

# ========================
# Clustering
# ========================
from hdbscan import HDBSCAN
import hdbscan
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import AgglomerativeClustering
from custom_packages.diffusion_condensation import DiffusionCondensation as dc

# ======================
# Evaluation Metrics
# ======================
from custom_packages.fowlkes_mallows import FowlkesMallows
from sklearn.metrics import adjusted_rand_score, rand_score


from tqdm import tqdm
# ==============
# Global Config
# ==============
# Seed NumPy's global random state so code relying on it is repeatable.
np.random.seed(42)
# Silence every warning for the rest of the session (this also hides
# deprecation and convergence warnings).
warnings.filterwarnings("ignore")

# Reload modules if needed: re-executes the already-imported phate module.
importlib.reload(phate)
from openai import OpenAI
# Read the API key from the GPT_API_KEY environment variable; os.getenv
# returns None when the variable is not set.
# NOTE(review): presumably populated by load_dotenv() in the first cell - confirm.
key = os.getenv('GPT_API_KEY')

# Shared OpenAI client; get_embeddings() below reads this module-level name.
client = OpenAI(api_key=key)

In [4]:
# Load the metadata spreadsheet; the path is relative to the project root
# that the first cell switched into.
df = pd.read_excel('data/WebOfScience/Meta-data/Data.xlsx')

In [5]:
# Show the first row to check which columns were read.
df.head(1)

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...


In [6]:


# One flat record per metadata row: the keyword string (cast to str) is the text
# to embed, 'Domain' is stored as 'category 0' and 'area' as 'category 1'.
new = [
    {
        'topic': str(record['keywords']),
        'category 0': record['Domain'],
        'category 1': record['area'],
    }
    for _, record in df.iterrows()
]


In [7]:
# Assemble the per-row records into a DataFrame (columns: topic, category 0, category 1).
df_new = pd.DataFrame(new)

In [8]:
# Display the assembled frame.
df_new

Unnamed: 0,topic,category 0,category 1
0,(2+1)-dimensional non-linear optical waves; e...,CS,Symbolic computation
1,Aging; Tau; Amyloid; PET; Alzheimer's disease...,Medical,Alzheimer's Disease
2,LED lighting system; PV system; Distributed l...,Civil,Green Building
3,NdFeB magnets; Electric motor; Electric vehic...,ECE,Electric motor
4,Parkinson's disease; dyskinesia; adenosine A(...,Medical,Parkinson's Disease
...,...,...,...
46980,Karate; Verletzungsrisiko; Sportverletzung; P...,Medical,Sports Injuries
46981,Z-Wave; Wireless; Embedded systems; Internet ...,CS,Data structures
46982,Antifouling biosensor; Peptide; Electrochemis...,Medical,Cancer
46983,High Performance Computing; Parallel Computin...,CS,Distributed computing


In [9]:
def get_embeddings(texts, model="text-embedding-3-small", batch_size=200):
    """
    Fetch OpenAI embeddings for a collection of texts, one batch per request.

    Relies on two module-level names: ``client`` (the OpenAI client whose
    ``embeddings.create`` is called) and ``tqdm`` (progress bar).

    Args:
        texts (str or iterable of str): Text inputs, e.g. a list or a pandas
            Series. The input is materialised into a plain ``list`` so every
            batch sent to the API is a list, regardless of the container type
            (or index) of the input. A single string is treated as one text.
        model (str): Name of the embedding model to request.
        batch_size (int): Number of texts sent per request. Defaults to 200.

    Returns:
        list: The ``embedding`` of every entry in each response's ``data``,
        concatenated batch after batch. The API is assumed to return entries
        in the same order as the inputs.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be split into characters by list().
    if isinstance(texts, str):
        texts = [texts]

    # Plain list: positional slicing and a payload the client can serialise,
    # whatever container the caller handed in.
    text_list = list(texts)
    embeddings = []

    for start in tqdm(range(0, len(text_list), batch_size), desc="Fetching GPT embeddings", unit="batch"):
        batch = text_list[start : start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        embeddings.extend(entry.embedding for entry in response.data)

    return embeddings

In [None]:
# Embedding model; its name also prefixes the result / reduced-embedding folders.
embedding_model = "text-embedding-3-large"
os.makedirs(f'{embedding_model}_results', exist_ok=True)
os.makedirs('gpt_embeddings', exist_ok=True)

# Re-use embeddings saved by an earlier run instead of calling the API again.
# The cache file name does not encode the model or the data, so delete the file
# to force a refresh after changing either of them.
embedding_cache = "gpt_embeddings/WOS_grouped_embed.npy"
embedding_list = np.load(embedding_cache) if os.path.exists(embedding_cache) else None

# Fetch when there is no cache, or when the cache obviously belongs to a
# different table (row count mismatch).
if embedding_list is None or len(embedding_list) != len(df_new):
    # Pass a plain list of strings, as get_embeddings documents.
    embedding_list = get_embeddings(df_new['topic'].tolist(), model=embedding_model)


In [None]:
# Folder for the reduced embeddings saved further down; named after the embedding model.
os.makedirs(f'{embedding_model}_reduced_embeddings', exist_ok=True)

In [None]:
# Persist the raw embeddings so a later session can load them from disk.
np.save("gpt_embeddings/WOS_grouped_embed.npy",embedding_list)

In [10]:
# Load the saved embeddings back from disk as a NumPy array.
embedding_list = np.load("gpt_embeddings/WOS_grouped_embed.npy")

In [21]:
# Inspect the dimensions of the loaded embedding array.
embedding_list.shape

(46985, 3072)

In [11]:
# Fixed-seed permutation of row positions, shared by the labels and the embeddings
shuffle_idx = np.random.RandomState(seed=42).permutation(len(df_new))
# Apply the same permutation to the embedding matrix and to the label frame
data = np.array(embedding_list)[shuffle_idx]
topic_data = df_new.iloc[shuffle_idx].reset_index(drop=True)
# Inverse permutation: indexing shuffled rows with it restores the original order
reverse_idx = shuffle_idx.argsort()

In [10]:
# sample_size = int(0.20 * len(topic_data))
# topic_data_sample = topic_data.iloc[:sample_size].reset_index(drop=True)
# data_sample = data[:sample_size]

In [12]:
# Map "number of distinct values in a category column" -> that column's values
# as an array. Only columns named exactly 'category <digits>' are considered;
# if two columns share a count, the later column wins.
category_pattern = re.compile(r'^category \d+$')
topic_dict = {
    len(topic_data[name].unique()): np.array(topic_data[name])
    for name in topic_data.columns
    if category_pattern.match(name)
}

In [17]:
# Reduce the shuffled embeddings to 300 PHATE components with a fixed random_state.
# The recorded run log for this cell reports roughly 1418 seconds in total.
# NOTE(review): n_pca=None presumably disables PHATE's PCA preprocessing (no PCA
# stage appears in the run log) - confirm against the PHATE documentation.
reducer_model = phate.PHATE(n_jobs=-2,random_state=42, n_components=300,decay=20,t="auto",n_pca=None)
embed_phate = reducer_model.fit_transform(data)


Calculating PHATE...
  Running PHATE on 46985 observations and 3072 variables.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 687.58 seconds.
    Calculating affinities...
    Calculated affinities in 567.51 seconds.
  Calculated graph and diffusion operator in 1255.41 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 7.65 seconds.
    Calculating KMeans...
    Calculated KMeans in 2.90 seconds.
  Calculated landmark operator in 11.81 seconds.
  Calculating optimal t...
    Automatically selected t = 31
  Calculated optimal t in 0.72 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.57 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 149.19 seconds.
Calculated PHATE in 1417.88 seconds.


In [31]:

# Save the PHATE coordinates under the model-specific reduced-embeddings folder.
np.save(f"{embedding_model}_reduced_embeddings/PHATE_WOS_embed.npy",np.array(embed_phate))

In [19]:
# Load the saved PHATE coordinates back from disk.
embed_phate = np.load(f"{embedding_model}_reduced_embeddings/PHATE_WOS_embed.npy")

In [13]:
# Number of 'category <n>' columns to use
depth = 2
# Distinct-value count of each category column, visited from the highest
# index ('category 1') down to 'category 0'
cluster_levels = []
for i in range(depth - 1, -1, -1):
    n_labels = len(topic_data[f'category {i}'].unique())
    cluster_levels.append(n_labels)

In [20]:
# Switches read by the PCA / UMAP reduction cell that follows
include_pca =True
include_umap=True

# Array copy of the shuffled embedding matrix; input to the PCA / UMAP / t-SNE reductions
embeddings = np.array(data)
# Registry: reduction-method name -> reduced embedding array
embedding_methods = {}

# PHATE coordinates computed (or loaded) earlier in the notebook
embedding_methods["PHATE"]  =embed_phate


In [15]:

# PCA down to 300 components. The result is saved inside the guard so that
# disabling include_pca no longer fails on a missing "PCA" key. random_state is
# fixed, matching UMAP and t-SNE below, so the result does not depend on the
# state of NumPy's global RNG when a randomized SVD solver is selected.
if include_pca:
    pca = PCA(n_components=300, random_state=42)
    embedding_methods["PCA"] = pca.fit_transform(embeddings)
    np.save(f"{embedding_model}_reduced_embeddings/PCA_WOS_embed.npy", embedding_methods["PCA"])

# UMAP down to 300 components; saved only when it was actually computed
if include_umap:
    umap_model = umap.UMAP(n_components=300, random_state=42,min_dist=.05,n_neighbors=10)
    embedding_methods["UMAP"] = umap_model.fit_transform(embeddings)
    np.save(f"{embedding_model}_reduced_embeddings/UMAP_WOS_embed_new.npy", embedding_methods["UMAP"])

# t-SNE down to 3 components (always run: there is no include flag for it)
tsne_model = TSNE(n_components=3, random_state=42)
embedding_methods["tSNE"] = tsne_model.fit_transform(embeddings)
np.save(f"{embedding_model}_reduced_embeddings/tSNE_WOS_embed.npy",embedding_methods["tSNE"])

In [18]:
# Load the saved PCA and UMAP reductions from disk into the registry
# (t-SNE is not reloaded in this cell).
embedding_methods["PCA"]= np.load(f"{embedding_model}_reduced_embeddings/PCA_WOS_embed.npy")

embedding_methods["UMAP"] =  np.load(f"{embedding_model}_reduced_embeddings/UMAP_WOS_embed_new.npy")


In [21]:
# scores_all[(reduction, clustering)][metric] -> list with one score per entry
# of cluster_levels, appended in cluster_levels order.
scores_all = defaultdict(lambda: defaultdict(list))

# Distinct-value counts that have ground-truth labels; does not change inside the loops
available_levels = sorted(topic_dict.keys())

for embed_name, embed_data in embedding_methods.items():
    for cluster_method in ["Agglomerative", "HDBSCAN","DC"]:
        for level in cluster_levels:

            # Clustering: each method is asked for `level` clusters
            if cluster_method == "Agglomerative":
                model = AgglomerativeClustering(n_clusters=level)
                model.fit(embed_data)
                labels = model.labels_
            elif cluster_method == "HDBSCAN":
                # NOTE(review): min_cluster_size re-uses the target cluster count
                # as HDBSCAN's minimum cluster size - confirm this is intended.
                model = hdbscan.HDBSCAN(min_cluster_size=level)
                model.fit(embed_data)
                # Cut the single-linkage tree into at most `level` flat clusters.
                # This previously passed a stale notebook-global `i` left over
                # from an earlier cell's loop instead of `level`.
                Z = model.single_linkage_tree_.to_numpy()
                # fcluster numbers clusters from 1, so there is no -1 noise
                # label to remap afterwards.
                labels = fcluster(Z, level, criterion='maxclust')
            elif cluster_method=="DC":
                model = dc(min_clusters=level, max_iterations=5000,k=10,alpha=4,t=3)
                model.fit(embed_data)
                labels  =model.labels_

            # Ground truth: the category column whose distinct-value count is closest to `level`
            closest_level = min(available_levels, key=lambda k: abs(k - level))

            topic_series = topic_dict[closest_level]
            # Score only the rows that actually have a ground-truth label
            valid_idx = ~pd.isna(topic_series)

            target_lst = topic_series[valid_idx]
            label_lst = labels[valid_idx]

            # Compute metrics
            try:
                fm_score = FowlkesMallows.Bk({level: target_lst}, {level: label_lst})[level]['FM']
            except Exception as exc:
                # Keep the run going, but report the failure instead of hiding it
                # (printed because warnings are globally silenced in this notebook).
                print(f"FM score failed for {embed_name}/{cluster_method} at level {level}: {exc!r}")
                fm_score = np.nan

            scores_all[(embed_name, cluster_method)]["FM"].append(fm_score)
            scores_all[(embed_name, cluster_method)]["Rand"].append(rand_score(target_lst, label_lst))
            scores_all[(embed_name, cluster_method)]["ARI"].append(adjusted_rand_score(target_lst, label_lst))

In [22]:

# Flatten scores_all into one record per (reduction, clustering, level).
# Scores were appended in cluster_levels order, so position idx in each score
# list corresponds to cluster_levels[idx].
rows = [
    {
        "reduction_method": embed_name,
        "cluster_method": cluster_method,
        "level": cluster_levels[idx],
        "FM": score_dict["FM"][idx],
        "Rand": score_dict["Rand"][idx],
        "ARI": score_dict["ARI"][idx],
        "Params":"{'k':10,'alpha':4,'t':3}"
    }
    for (embed_name, cluster_method), score_dict in scores_all.items()
    for idx in range(len(score_dict["FM"]))
]

# Tabulate the records and order them for easier viewing
scores_df = (
    pd.DataFrame(rows)
    .sort_values(by=["reduction_method", "cluster_method", "level"])
    .reset_index(drop=True)
)

# Append to the results CSV; the header is written only when the file does not exist yet
results_csv = f"{embedding_model}_results/other_WOS_results.csv"
write_header = not os.path.exists(results_csv)
scores_df.to_csv(results_csv, mode='a', index=False, header=write_header)

In [None]:
import json

# Colour lookup used by the plotting cell (queried there with
# "<reduction>_<clustering>" keys). The file is read as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open("combo_color_map.json", 'r', encoding="utf-8") as file:
    combo_color_map = json.load(file)

In [None]:
import matplotlib.pyplot as plt

metrics = ['FM', 'Rand', 'ARI']

# One figure per metric, with one line per (reduction, clustering) combination
for metric in metrics:
    plt.figure(figsize=(10, 6))

    for (embed_name, method), metric_scores in scores_all.items():
        # Spell out the abbreviation used for diffusion condensation
        method = "Diffusion Condensation" if method == "DC" else method
        series_label = f"{embed_name} {method}"
        # Combinations missing from the colour map are drawn in black
        line_color = combo_color_map.get(f"{embed_name}_{method}", 'black')
        plt.plot(
            cluster_levels,
            metric_scores[metric],
            marker='o',
            label=series_label,
            color=line_color,
        )

    plt.title(f"{metric} Score Across Cluster Levels")
    plt.xlabel("Cluster Level")
    plt.ylabel(metric)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()