# Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


# Load Data

In [9]:
# This is the big arXiv metadata file (it's huge, so we stream it)
import json
import pandas as pd
from tqdm import tqdm
import random
file_path = "../../../../arxiv-metadata-oai-snapshot.json"

records = []  # Only the papers that pass the category filter end up here

# Stream the file one line at a time (each line is parsed as its own JSON
# object) so the whole snapshot never has to be held in memory at once.
# The encoding is spelled out because open() otherwise falls back to the
# platform default, which is not UTF-8 everywhere and can raise or garble
# non-ASCII characters in titles and abstracts.
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):  # Progress bar, since the pass over the file takes a while
        if not line.strip():
            continue  # Skip blank lines instead of crashing inside json.loads

        paper = json.loads(line)

        # Keep Computer Science and Physics papers only.
        # Note: startswith() looks only at the beginning of the category
        # string, so a paper whose string begins with some other category and
        # merely lists a cs.* / physics.* entry later on is NOT kept.
        if paper["categories"].startswith(("cs.", "physics.")):

            # Title and abstract are joined into one text field ("topic")
            # so there is a single string per paper to embed later.
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# One row per kept paper, with the columns "topic" and "categories"
df_arxiv = pd.DataFrame(records)

# Last expression of the cell: display the resulting frame
df_arxiv


2951540it [00:24, 122569.77it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


In [10]:
# Embedding every filtered paper is not practical, so work on a random
# subset of 30,000 rows instead.
# - random_state=42 pins the draw, so the same rows are selected on every run
#   over the same input frame.
# - reset_index(drop=True) replaces the original row labels with a fresh
#   0..n-1 index and does not keep the old labels as a column.
df_arxiv = (
    df_arxiv
    .sample(n=30000, random_state=42)
    .reset_index(drop=True)
)
df_arxiv

Unnamed: 0,topic,categories
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG cs.CR
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL cs.IR cs.LG
...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics.chem-ph quant-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs.IT math.IT
29998,Non-consensus opinion models on complex networ...,physics.soc-ph cs.SI


In [14]:
# Show the column labels of the current frame.
# NOTE(review): the output recorded for this cell lists 'category_0' and
# 'category_1', while the cells visible in this notebook only ever create
# 'topic' and 'categories'. Presumably a cell that split the category string
# (execution counts 11-13 are absent) was removed -- confirm and restore it so
# that Restart & Run All reproduces the recorded columns.
df_arxiv.columns


Index(['topic', 'category_0', 'category_1'], dtype='object')

In [15]:
import os

# Create the target folder up front; exist_ok=True means an already existing
# folder is not treated as an error.
os.makedirs(name="data/arxiv", exist_ok=True)

# Write the frame to disk as CSV; index=False leaves the row index out of the file.
df_arxiv.to_csv(path_or_buf="data/arxiv/arxiv_30k.csv", index=False)

# Plain confirmation message once the write call has returned
print("Saved arxiv_30k.csv successfully")


Saved arxiv_30k.csv successfully


# Embedding

# Benchmark (Mirror Amazon)