# arXiv Version (Structured Like Your WOS Pipeline)

### Load arXiv Metadata (Streaming – Don’t Load 1.5GB Fully)

In [5]:
import json
import pandas as pd
from tqdm import tqdm

# This is the big arXiv metadata file (it's huge, so we stream it)
file_path = "../../../../arxiv-metadata-oai-snapshot.json"

records = []  # We’ll store only the papers we actually care about

# Read the file line by line so we don’t crash the computer
with open(file_path, "r") as f:
    for line in tqdm(f):  # Just to see progress because this takes a minute
        paper = json.loads(line)
        
        # We only want Computer Science and Physics papers
        # That matches our hierarchy goal and keeps things manageable
        if paper["categories"].startswith(("cs.", "physics.")):
            
            # Combine title + abstract into one text field for embeddings later
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# Turn everything into a DataFrame so we can work with it easily
df_arxiv = pd.DataFrame(records)

# Quick check to make sure it loaded correctly
df_arxiv.head()

# Show full dataset
df_arxiv


2951540it [00:23, 127505.48it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


# arXiv Dataset Subsampling
The full arXiv metadata snapshot contains over 900,000 physics-related records, which is computationally expensive to process for embedding generation and dimensionality reduction. Generating embeddings for the entire dataset would significantly increase runtime, memory usage, and API costs without providing meaningful additional evaluation benefits for this study.
To ensure computational feasibility while preserving hierarchical diversity, we randomly sampled 30,000 papers from the filtered Physics subset. This sample size is sufficient to:
Maintain a rich hierarchical structure across subject categories
Enable robust clustering evaluation
Provide statistically meaningful benchmarking results
Keep embedding and PHATE computation tractable
The sampling procedure was performed using a fixed random seed to ensure reproducibility.

In [6]:
# The full dataset is way too big to embed, so we randomly sample 30,000 papers.
# random_state=42 keeps it reproducible (so we always get the same sample).
# reset_index just cleans up the index after sampling.
df_arxiv = df_arxiv.sample(30000, random_state=42).reset_index(drop=True)
df_arxiv

Unnamed: 0,topic,categories
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG cs.CR
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL cs.IR cs.LG
...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics.chem-ph quant-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs.IT math.IT
29998,Non-consensus opinion models on complex networ...,physics.soc-ph cs.SI


In [7]:
# Display the first 5 rows of the DataFrame to verify structure and contents
df_arxiv.head

<bound method NDFrame.head of                                                    topic  \
0      Semantic Agreement Enables Efficient Open-Ende...   
1      Scheduling in Grid Computing Environment   Sch...   
2      Taking off the Rose-Tinted Glasses: A Critical...   
3      Traffic Performance Score for Measuring the Im...   
4      SueNes: A Weakly Supervised Approach to Evalua...   
...                                                  ...   
29995  Improving Neural Machine Translation by Multi-...   
29996  Strong Exciton-Vibrational Coupling in Molecul...   
29997  Duality of generalized twisted Reed-Solomon co...   
29998  Non-consensus opinion models on complex networ...   
29999  Energy-limited Joint Source--Channel Coding vi...   

                     categories  
0                         cs.CL  
1                         cs.DC  
2                   cs.LG cs.CR  
3                physics.soc-ph  
4             cs.CL cs.IR cs.LG  
...                         ...  
29995      

# Extract Primary Category + Top Domain

In [8]:
# Function to extract category hierarchy information
# - primary: first listed arXiv category (most representative)
# - top_level: broad domain (e.g., "cs" or "physics")
def extract_categories(cat_string):
    primary = cat_string.split()[0]          # first category in the list
    top_level = primary.split('.')[0]       # extract top-level domain
    return top_level, primary


# Apply the function to create two new columns:
# "category 0" = top-level domain
# "category 1" = primary sub-category
df_arxiv[["category 0", "category 1"]] = df_arxiv["categories"].apply(
    lambda x: pd.Series(extract_categories(x))
)

# Keep only the columns needed for embedding + hierarchical evaluation
df_arxiv = df_arxiv[["topic", "category 0", "category 1"]]

# Preview updated structure
df_arxiv.head()



Unnamed: 0,topic,category 0,category 1
0,Semantic Agreement Enables Efficient Open-Ende...,cs,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs,cs.LG
3,Traffic Performance Score for Measuring the Im...,physics,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs,cs.CL


# Install Required Libraries

In [9]:
# Install or upgrade required libraries for transformer-based embeddings
# - transformers: model + tokenizer loading
# - torch: deep learning backend
# - accelerate: efficient model execution
# - sentencepiece: tokenizer support for some models (e.g., Qwen)
!pip install -U transformers torch accelerate sentencepiece


Collecting torch
  Using cached torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Using cached torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl (79.5 MB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Installing collected packages: sympy, torch
[2K  Attempting uninstall: sympy
[2K    Found existing installation: sympy 1.13.1
[2K    Uninstalling sympy-1.13.1:
[2K      Successfully uninstalled sympy-1.13.1━━━━━[0m [32m0/2[0m [sympy]
[2K  Attempting uninstall: torch━━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [sympy]
[2K    Found existing installation: torch 2.6.0[0m [32m0/2[0m [sympy]
[2K    Uninstalling torch-2.6.0:[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [torch]
[2K      Successfully uninstalled torch-2.6.0m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [torch]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [torch]32m1/2[0m [torch]
[1

# Load Qwen Embedding Model

In [10]:
import transformers
print(transformers.__version__)


5.1.0


In [11]:
# Import Hugging Face utilities for loading pretrained models
from transformers import AutoTokenizer, AutoModel

# Specify the embedding model from Hugging Face Hub
model_name = "Qwen/Qwen3-Embedding-0.6B"

# Load tokenizer (handles text → token conversion)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model weights
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
print("Model loaded successfully")



Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

Model loaded successfully


# Define Proper Embedding Function

In [12]:
import torch
import numpy as np
from tqdm import tqdm

# Function to generate embeddings for a list of texts using the loaded model
def embed_texts(texts, batch_size=32, max_length=512):
    all_embeddings = []

    # Process texts in batches to avoid memory overload
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]

        # Tokenize text batch (convert text → model inputs)
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move tensors to correct device (CPU or GPU)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Disable gradient computation (inference mode)
        with torch.no_grad():
            outputs = model(**inputs)

        # Convert token-level outputs to a single vector per text (mean pooling)
        embeddings = outputs.last_hidden_state.mean(dim=1)

        # Store batch embeddings
        all_embeddings.append(embeddings.cpu().numpy())

    # Combine all batches into one final embedding matrix
    return np.vstack(all_embeddings)


# Now Generate arXiv Embeddings (30k)

In [15]:
df_arxiv

Unnamed: 0,topic,category 0,category 1
0,Semantic Agreement Enables Efficient Open-Ende...,cs,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs,cs.LG
3,Traffic Performance Score for Measuring the Im...,physics,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs,cs.CL
...,...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics,physics.chem-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs,cs.IT
29998,Non-consensus opinion models on complex networ...,physics,physics.soc-ph


In [16]:
import torch

# Select device: use GPU if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Move model to the selected device
model = model.to(device)

# Set model to evaluation mode (disables dropout, etc.)
model.eval()



Using device: cpu


Qwen3Model(
  (embed_tokens): Embedding(151669, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): Qwen3RM

# arXiv Embedding Pipeline (HPCC Ready) consider batch_size=64 before run it 


In [17]:
# Generate embeddings for all 30k arXiv text entries
embedding_array = embed_texts(df_arxiv["topic"].tolist())

# Check the shape of the resulting embedding matrix
# Expected shape: (30000, embedding_dimension)
embedding_array.shape



  0%|          | 0/938 [02:31<?, ?it/s]


KeyboardInterrupt: 

# Save it

In [None]:
np.save("arxiv_qwen_embeddings.npy", embedding_array)
