# Clean Imports

In [9]:
conda install numpy=2.2


Retrieving notices: done
Channels:
 - defaults
 - conda-forge
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - numpy=2.2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    networkx-3.6.1             |  py312hca03da5_0         3.1 MB
    numpy-2.2.5                |  py312h40d09ce_2          11 KB
    numpy-base-2.2.5           |  py312h855c928_2         6.1 MB
    ------------------------------------------------------------
                                           Total:         9.2 MB

The following packages will be UPDATED:

  networkx                              3.3-py312hca03da5_0 --> 3.6.1-py312hca03da5_0 
  numpy                              1.26.4-py312h901140f_1 --> 2.2.5-py312h40d09ce_2 
  numpy-base                         1.26.4-py312hae0

In [4]:
import os
print(os.getcwd())


/Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP/src/data/arxiv


In [11]:
import os
import sys

# Move up until we reach project root
target_folder = "NCEAS_Unsupervised_NLP"
current_dir = os.getcwd()

while os.path.basename(current_dir) != target_folder:
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        raise FileNotFoundError(f"{target_folder} not found in directory tree.")
    current_dir = parent_dir

os.chdir(current_dir)

# IMPORTANT: Add src to Python path
sys.path.insert(0, os.path.join(current_dir, "src"))

print("Project root set to:", current_dir)
print("src added to path:", os.path.join(current_dir, "src"))



Project root set to: /Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP
src added to path: /Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP/src


In [12]:
import sys
print(sys.path[:3])


['/Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP/src', '/Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP', '/opt/anaconda3/lib/python312.zip']


In [13]:
import os
import re
import numpy as np
import pandas as pd
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

import phate
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import AgglomerativeClustering
import hdbscan
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score, rand_score
import pandas as pd
import numpy as np
import phate
import umap
import hdbscan

from custom_packages.diffusion_condensation import DiffusionCondensation as dc
from custom_packages.fowlkes_mallows import FowlkesMallows


# Load Data (No Streaming Now)

In [20]:
# This is the big arXiv metadata file (it's huge, so we stream it)
import json
import pandas as pd
from tqdm import tqdm
import random
file_path = "/Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/arxiv-metadata-oai-snapshot.json"


records = []  # We’ll store only the papers we actually care about

# Read the file line by line so we don’t crash the computer
with open(file_path, "r") as f:
    for line in tqdm(f):  # Just to see progress because this takes a minute
        paper = json.loads(line)
        
        # We only want Computer Science and Physics papers
        # That matches our hierarchy goal and keeps things manageable
        if paper["categories"].startswith(("cs.", "physics.")):
            
            # Combine title + abstract into one text field for embeddings later
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# Turn everything into a DataFrame so we can work with it easily
df_arxiv = pd.DataFrame(records)

# Quick check to make sure it loaded correctly
df_arxiv.head()

# Show full dataset
df_arxiv


2951540it [00:23, 127738.20it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


In [21]:
# The full dataset is way too big to embed, so we randomly sample 30,000 papers.
# random_state=42 keeps it reproducible (so we always get the same sample).
# reset_index just cleans up the index after sampling.
df_arxiv = df_arxiv.sample(30000, random_state=42).reset_index(drop=True)
df_arxiv

Unnamed: 0,topic,categories
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG cs.CR
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL cs.IR cs.LG
...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics.chem-ph quant-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs.IT math.IT
29998,Non-consensus opinion models on complex networ...,physics.soc-ph cs.SI


In [23]:
def extract_categories(cat_string):
    primary = cat_string.split()[0]
    top_level = primary.split('.')[0]
    return top_level, primary

df_arxiv[["category_0", "category_1"]] = df_arxiv["categories"].apply(
    lambda x: pd.Series(extract_categories(x))
)

df_arxiv = df_arxiv[["topic", "category_0", "category_1"]]
df_arxiv.head()

Unnamed: 0,topic,category_0,category_1
0,Semantic Agreement Enables Efficient Open-Ende...,cs,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs,cs.LG
3,Traffic Performance Score for Measuring the Im...,physics,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs,cs.CL


# Build topic_dict

In [24]:
topic_dict = {}

for col in df_arxiv.columns:
    if re.match(r'^category_\d+$', col):
        unique_count = len(df_arxiv[col].unique())
        topic_dict[unique_count] = np.array(df_arxiv[col])


# Define cluster levels

In [28]:
depth = 2
cluster_levels = []

for i in reversed(range(depth)):
    cluster_levels.append(len(df_arxiv[f'category_{i}'].unique()))

print("Cluster levels:", cluster_levels)



Cluster levels: [62, 2]


# Load Qwen Embeddings
# You Are Now Ready For:
Step 4 — Load Qwen embeddings

In [26]:
embeddings = np.load("gpt_embeddings/arxiv_qwen_embeddings.npy")
print("Embeddings shape:", embeddings.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'gpt_embeddings/arxiv_qwen_embeddings.npy'

# Create embedding_methods (THIS WAS MISSING)
# Build embedding_methods
Then:
PHATE
PCA
UMAP

In [27]:
embedding_methods = {}

# PHATE
phate_model = phate.PHATE(n_components=300, random_state=42)
embedding_methods["PHATE"] = phate_model.fit_transform(embeddings)

# PCA
pca = PCA(n_components=300)
embedding_methods["PCA"] = pca.fit_transform(embeddings)

# UMAP
umap_model = umap.UMAP(n_components=300, random_state=42)
embedding_methods["UMAP"] = umap_model.fit_transform(embeddings)


NameError: name 'embeddings' is not defined