# arXiv Version (Structured Like Your WOS Pipeline)

### 1. Load arXiv Metadata (Streaming – Don’t Load 1.5GB Fully)

In [1]:
import json
import pandas as pd
from tqdm import tqdm

# This is the big arXiv metadata file (it's huge, so we stream it)
file_path = "../../../../arxiv-metadata-oai-snapshot.json"

records = []  # We’ll store only the papers we actually care about

# Read the file line by line so we don’t crash the computer
with open(file_path, "r") as f:
    for line in tqdm(f):  # Just to see progress because this takes a minute
        paper = json.loads(line)
        
        # We only want Computer Science and Physics papers
        # That matches our hierarchy goal and keeps things manageable
        if paper["categories"].startswith(("cs.", "physics.")):
            
            # Combine title + abstract into one text field for embeddings later
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# Turn everything into a DataFrame so we can work with it easily
df_arxiv = pd.DataFrame(records)

# Quick check to make sure it loaded correctly
df_arxiv.head()

# Show full dataset
df_arxiv


2951540it [00:24, 122045.85it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


# 2. arXiv Dataset Subsampling
The full arXiv metadata snapshot contains over 900,000 physics-related records, which is computationally expensive to process for embedding generation and dimensionality reduction. Generating embeddings for the entire dataset would significantly increase runtime, memory usage, and API costs without providing meaningful additional evaluation benefits for this study.
To ensure computational feasibility while preserving hierarchical diversity, we randomly sampled 30,000 papers from the filtered Physics subset. This sample size is sufficient to:
Maintain a rich hierarchical structure across subject categories
Enable robust clustering evaluation
Provide statistically meaningful benchmarking results
Keep embedding and PHATE computation tractable
The sampling procedure was performed using a fixed random seed to ensure reproducibility.

In [6]:
# The full dataset is way too big to embed, so we randomly sample 30,000 papers.
# random_state=42 keeps it reproducible (so we always get the same sample).
# reset_index just cleans up the index after sampling.
df_arxiv = df_arxiv.sample(30000, random_state=42).reset_index(drop=True)
df_arxiv

Unnamed: 0,topic,categories
0,Optimized Cloud Resource Allocation Using Gene...,cs.DC cs.AI
1,Control of Rayleigh-like waves in thick plate ...,physics.geo-ph cond-mat.mtrl-sci
2,Deep Text Classification Can be Fooled In th...,cs.CR cs.LG
3,Clip-TTS: Contrastive Text-content and Mel-spe...,cs.SD cs.AI cs.CL cs.HC cs.LG eess.AS
4,Convex Cauchy Schwarz Independent Component An...,cs.IT math.IT
...,...,...
29995,HyperAttention: Long-context Attention in Near...,cs.LG cs.AI
29996,Maximum Likelihood Estimation of Power-law Deg...,cs.SI physics.data-an physics.soc-ph
29997,SCL(FOL) Can Simulate Non-Redundant Superposit...,cs.LO cs.AI cs.SC
29998,Submodlib: A Submodular Optimization Library ...,cs.LG cs.IR


In [7]:
# Display the first 5 rows of the DataFrame to verify structure and contents
df_arxiv.head

<bound method NDFrame.head of                                                    topic  \
0      Optimized Cloud Resource Allocation Using Gene...   
1      Control of Rayleigh-like waves in thick plate ...   
2      Deep Text Classification Can be Fooled   In th...   
3      Clip-TTS: Contrastive Text-content and Mel-spe...   
4      Convex Cauchy Schwarz Independent Component An...   
...                                                  ...   
29995  HyperAttention: Long-context Attention in Near...   
29996  Maximum Likelihood Estimation of Power-law Deg...   
29997  SCL(FOL) Can Simulate Non-Redundant Superposit...   
29998  Submodlib: A Submodular Optimization Library  ...   
29999  Deep Learning Method to Predict Wound Healing ...   

                                  categories  
0                                cs.DC cs.AI  
1           physics.geo-ph cond-mat.mtrl-sci  
2                                cs.CR cs.LG  
3      cs.SD cs.AI cs.CL cs.HC cs.LG eess.AS  
4             

# 3. Extract Primary Category + Top Domain

In [8]:
# Function to extract category hierarchy information
# - primary: first listed arXiv category (most representative)
# - top_level: broad domain (e.g., "cs" or "physics")
def extract_categories(cat_string):
    primary = cat_string.split()[0]          # first category in the list
    top_level = primary.split('.')[0]       # extract top-level domain
    return top_level, primary


# Apply the function to create two new columns:
# "category 0" = top-level domain
# "category 1" = primary sub-category
df_arxiv[["category 0", "category 1"]] = df_arxiv["categories"].apply(
    lambda x: pd.Series(extract_categories(x))
)

# Keep only the columns needed for embedding + hierarchical evaluation
df_arxiv = df_arxiv[["topic", "category 0", "category 1"]]

# Preview updated structure
df_arxiv.head()



Unnamed: 0,topic,category 0,category 1
0,Optimized Cloud Resource Allocation Using Gene...,cs,cs.DC
1,Control of Rayleigh-like waves in thick plate ...,physics,physics.geo-ph
2,Deep Text Classification Can be Fooled In th...,cs,cs.CR
3,Clip-TTS: Contrastive Text-content and Mel-spe...,cs,cs.SD
4,Convex Cauchy Schwarz Independent Component An...,cs,cs.IT


# Install Required Libraries

In [9]:
# Install or upgrade required libraries for transformer-based embeddings
# - transformers: model + tokenizer loading
# - torch: deep learning backend
# - accelerate: efficient model execution
# - sentencepiece: tokenizer support for some models (e.g., Qwen)
!pip install -U transformers torch accelerate sentencepiece



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
df_arxiv

Unnamed: 0,topic,category 0,category 1
0,Optimized Cloud Resource Allocation Using Gene...,cs,cs.DC
1,Control of Rayleigh-like waves in thick plate ...,physics,physics.geo-ph
2,Deep Text Classification Can be Fooled In th...,cs,cs.CR
3,Clip-TTS: Contrastive Text-content and Mel-spe...,cs,cs.SD
4,Convex Cauchy Schwarz Independent Component An...,cs,cs.IT
...,...,...,...
29995,HyperAttention: Long-context Attention in Near...,cs,cs.LG
29996,Maximum Likelihood Estimation of Power-law Deg...,cs,cs.SI
29997,SCL(FOL) Can Simulate Non-Redundant Superposit...,cs,cs.LO
29998,Submodlib: A Submodular Optimization Library ...,cs,cs.LG


# 4. Load SentenceTransformer (Qwen)

In [11]:
# 4. Load SentenceTransformer (Qwen)
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import os

model_name = "Qwen/Qwen3-Embedding-0.6B"
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Using device:", device)

model = SentenceTransformer(model_name, device=device)

embedding_array = model.encode(
    df_arxiv["topic"].tolist(),
    batch_size=32,          # 64 if strong GPU
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

print("Embedding shape:", embedding_array.shape)

Using device: cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]



Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 5. Save embeddings

In [None]:
import os
print(os.getcwd())


/Users/sukainaalkhalidy/Desktop/CMSE 495 capstone/NCEAS_Unsupervised_NLP/src/data/arxiv


In [14]:
np.save("arxiv_qwen_embeddings.npy", embedding_array)
print("Saved in:", os.getcwd())


NameError: name 'embedding_array' is not defined