# arXiv Version (Structured Like Your WOS Pipeline)

### Load arXiv Metadata (Streaming – Don’t Load 1.5GB Fully)

In [6]:
import json
import pandas as pd
from tqdm import tqdm

file_path = "../../../../arxiv-metadata-oai-snapshot.json"



records = []

# Stream safely (important for large file)
with open(file_path, "r") as f:
    for line in tqdm(f):
        paper = json.loads(line)
        
        # Only keep CS + Physics (like taxonomy goal)
        if paper["categories"].startswith(("cs.", "physics.")):
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# Convert to DataFrame
df_arxiv = pd.DataFrame(records)
df_arxiv.head()
df_arxiv

2951540it [00:21, 134433.67it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


# arXiv Dataset Subsampling
The full arXiv metadata snapshot contains over 900,000 physics-related records, which is computationally expensive to process for embedding generation and dimensionality reduction. Generating embeddings for the entire dataset would significantly increase runtime, memory usage, and API costs without providing meaningful additional evaluation benefits for this study.
To ensure computational feasibility while preserving hierarchical diversity, we randomly sampled 30,000 papers from the filtered Physics subset. This sample size is sufficient to:
Maintain a rich hierarchical structure across subject categories
Enable robust clustering evaluation
Provide statistically meaningful benchmarking results
Keep embedding and PHATE computation tractable
The sampling procedure was performed using a fixed random seed to ensure reproducibility.

In [7]:
df_arxiv = df_arxiv.sample(30000, random_state=42).reset_index(drop=True)
df_arxiv

Unnamed: 0,topic,categories
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG cs.CR
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL cs.IR cs.LG
...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics.chem-ph quant-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs.IT math.IT
29998,Non-consensus opinion models on complex networ...,physics.soc-ph cs.SI


In [8]:
df_arxiv.head

<bound method NDFrame.head of                                                    topic  \
0      Semantic Agreement Enables Efficient Open-Ende...   
1      Scheduling in Grid Computing Environment   Sch...   
2      Taking off the Rose-Tinted Glasses: A Critical...   
3      Traffic Performance Score for Measuring the Im...   
4      SueNes: A Weakly Supervised Approach to Evalua...   
...                                                  ...   
29995  Improving Neural Machine Translation by Multi-...   
29996  Strong Exciton-Vibrational Coupling in Molecul...   
29997  Duality of generalized twisted Reed-Solomon co...   
29998  Non-consensus opinion models on complex networ...   
29999  Energy-limited Joint Source--Channel Coding vi...   

                     categories  
0                         cs.CL  
1                         cs.DC  
2                   cs.LG cs.CR  
3                physics.soc-ph  
4             cs.CL cs.IR cs.LG  
...                         ...  
29995      

# Extract Primary Category + Top Domain

In [9]:
def extract_categories(cat_string):
    primary = cat_string.split()[0]  # first category
    top_level = primary.split('.')[0]
    return top_level, primary

df_arxiv[["category 0", "category 1"]] = df_arxiv["categories"].apply(
    lambda x: pd.Series(extract_categories(x))
)

df_arxiv = df_arxiv[["topic", "category 0", "category 1"]]
df_arxiv.head()


Unnamed: 0,topic,category 0,category 1
0,Semantic Agreement Enables Efficient Open-Ende...,cs,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs,cs.LG
3,Traffic Performance Score for Measuring the Im...,physics,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs,cs.CL


# Generate Embeddings (Same as WOS)
Important: 30k embeddings with text-embedding-3-large will cost money.
If cost is a concern, use "text-embedding-3-small" for arXiv.

# Generate GPT Embeddings for arXiv (30k Only)

In [10]:
!pip install --upgrade openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Install OpenAI (Correct Way)

In [11]:
import sys
!{sys.executable} -m pip install --upgrade openai



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Do NOT Use dotenv Right Now

In [19]:
from openai import OpenAI

client = OpenAI(api_key="Somekey")


In [20]:
test = get_embeddings(df_arxiv["topic"].tolist()[:5])
len(test)


Fetching GPT embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.39s/batch]


5

# Define function

In [21]:
from tqdm import tqdm

def get_embeddings(texts, model="text-embedding-3-small"):
    batch_size = 200
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Fetching GPT embeddings", unit="batch"):
        batch = texts[i : i + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        batch_embeddings = [entry.embedding for entry in response.data]
        embeddings.extend(batch_embeddings)

    return embeddings


# Run Embeddings

In [22]:
embedding_model = "text-embedding-3-large"

embedding_list = get_embeddings(
    df_arxiv["topic"].tolist(),
    model=embedding_model
)

embedding_array = np.array(embedding_list, dtype=np.float32)

np.save("gpt_embeddings/arxiv_embed.npy", embedding_array)

embedding_array.shape


Fetching GPT embeddings:  27%|██▋       | 40/150 [01:36<04:26,  2.42s/batch]


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-3-large in organization org-wGEUWom3eHxPOUq8KfncOLIs on tokens per min (TPM): Limit 40000, Requested 60066. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}