# Imports

In [16]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


# Load Data

In [17]:
# This is the big arXiv metadata file (it's huge, so we stream it)
import json
import pandas as pd
from tqdm import tqdm
import random
file_path = "../../../../arxiv-metadata-oai-snapshot.json"

records = []  # Only the papers we actually care about end up here

# Stream the snapshot one line at a time (each line is parsed as its own JSON
# object) instead of loading the whole file into memory.
# encoding="utf-8" is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can raise UnicodeDecodeError or silently
# garble non-ASCII characters in titles and abstracts.
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):  # Progress counter, because the scan takes a while
        paper = json.loads(line)

        # Keep only papers whose category string starts with a Computer
        # Science ("cs.") or Physics ("physics.") category.
        # That matches our hierarchy goal and keeps things manageable.
        if paper["categories"].startswith(("cs.", "physics.")):

            # Combine title + abstract into one text field for embeddings later
            records.append({
                "topic": paper["title"] + " " + paper["abstract"],
                "categories": paper["categories"]
            })

# Turn everything into a DataFrame so we can work with it easily
df_arxiv = pd.DataFrame(records)

# Display the frame as the cell output (the notebook renders a truncated view).
# The former mid-cell `df_arxiv.head()` was removed: only the last expression
# of a cell is displayed, so that call had no visible effect.
df_arxiv


2951540it [00:22, 132700.94it/s]


Unnamed: 0,topic,categories
0,The evolution of the Earth-Moon system based o...,physics.gen-ph
1,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
2,Convergence of the discrete dipole approximati...,physics.optics physics.comp-ph
3,The discrete dipole approximation for simulati...,physics.optics physics.comp-ph
4,The discrete dipole approximation: an overview...,physics.optics physics.comp-ph
...,...,...
924080,"Variational methods, multiprecision and nonrel...",physics.atom-ph physics.comp-ph
924081,Effective interaction between helical bio-mole...,physics.bio-ph physics.chem-ph physics.comp-ph...
924082,Atom-optics hologram in the time domain The ...,physics.atom-ph physics.optics
924083,A Second-Order Stochastic Leap-Frog Algorithm ...,physics.comp-ph


In [18]:
# Embedding the full dataset is too expensive, so work on a random subset of
# 30,000 papers instead.
#   - random_state=42 pins the draw, so every run selects the same papers.
#   - reset_index(drop=True) renumbers the rows from 0 and discards the old index.
df_arxiv = (
    df_arxiv
    .sample(n=30000, random_state=42)
    .reset_index(drop=True)
)
df_arxiv

Unnamed: 0,topic,categories
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG cs.CR
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL cs.IR cs.LG
...,...,...
29995,Improving Neural Machine Translation by Multi-...,cs.CL
29996,Strong Exciton-Vibrational Coupling in Molecul...,physics.chem-ph quant-ph
29997,Duality of generalized twisted Reed-Solomon co...,cs.IT math.IT
29998,Non-consensus opinion models on complex networ...,physics.soc-ph cs.SI


In [19]:
# Sanity check: list the columns carried by the sampled frame.
df_arxiv.columns


Index(['topic', 'categories'], dtype='object')

# Clean Labels

In [20]:
# Convert multi-label categories → single label: split the space-separated
# category string and keep only the first entry.
primary_label = df_arxiv["categories"].str.split().str[0]

# Keep only text + label (what the original note calls the "Amazon format"),
# renaming "topic" to "text". The frame is rebuilt in one chain instead of
# adding a column in place and then overwriting .columns on a slice.
df_arxiv = (
    df_arxiv
    .assign(label=primary_label)
    .loc[:, ["topic", "label"]]
    .rename(columns={"topic": "text"})
)

# Save clean dataset.
# The output directory is created first: to_csv does not create missing
# parent directories, so on a fresh checkout the write would otherwise fail.
os.makedirs("data/arxiv", exist_ok=True)
df_arxiv.to_csv("data/arxiv/arxiv_30k_clean.csv", index=False)

print("Saved cleaned arxiv dataset")
df_arxiv.head()


Saved cleaned arxiv dataset


Unnamed: 0,text,label
0,Semantic Agreement Enables Efficient Open-Ende...,cs.CL
1,Scheduling in Grid Computing Environment Sch...,cs.DC
2,Taking off the Rose-Tinted Glasses: A Critical...,cs.LG
3,Traffic Performance Score for Measuring the Im...,physics.soc-ph
4,SueNes: A Weakly Supervised Approach to Evalua...,cs.CL


# Load Clean Dataset for Embedding

In [21]:
# Read the cleaned dataset back from disk and pull the "text" column out as a
# plain Python list, which is what the embedding loop slices into batches.
df = pd.read_csv("data/arxiv/arxiv_30k_clean.csv")
texts = df.loc[:, "text"].to_list()
print("Loaded", len(texts), "documents")


Loaded 30000 documents


# Load Model

In [None]:
model_name = "sentence-transformers/all-mpnet-base-v2"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Switch the model to evaluation mode, then move it onto the chosen device.
model.eval()
model.to(device)

print("Model loaded on", device)


# Mean Pooling Function

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model output; element 0 is taken as the token
            embeddings (presumably shaped (batch, seq_len, hidden) — TODO confirm
            against the model in use).
        attention_mask: Mask tensor. It gets a trailing dimension added and is
            expanded to the token-embedding shape, so positions with mask 0
            contribute nothing to the sum.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the summed mask (clamped to at least 1e-9 so a fully
        masked row does not divide by zero).
    """
    hidden_states = model_output[0]

    # Broadcast the mask across the embedding dimension and make it float so
    # it can be multiplied with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


# Embedding Loop

In [23]:
# Re-select the device and (re-)place the model on it, so this cell does not
# depend on `device` surviving from an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

batch_size = 32
embedding_chunks = []  # one NumPy array of pooled embeddings per batch

for start in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[start:start + batch_size]

    # Tokenize the batch with padding enabled and truncation at 512 tokens,
    # returning PyTorch tensors.
    tokenized = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Move every tokenizer output tensor onto the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in tokenized.items()}

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        forward_output = model(**model_inputs)

    # Pool the token embeddings with the attention mask, then move the result
    # to CPU memory as a NumPy array.
    pooled = mean_pooling(forward_output, model_inputs["attention_mask"])
    embedding_chunks.append(pooled.cpu().numpy())

# Stack the per-batch arrays row-wise into a single matrix.
embeddings = np.vstack(embedding_chunks)

print("Embedding shape:", embeddings.shape)



  0%|          | 0/938 [00:00<?, ?it/s]


NameError: name 'device' is not defined

In [11]:
import os

# Ensure the output directory exists; exist_ok=True makes this a no-op when
# the directory is already there.
os.makedirs("data/arxiv", exist_ok=True)

# Write the current df_arxiv frame to CSV without the index column.
# NOTE(review): this cell's execution count (In [11]) is out of sequence. When
# the notebook runs top to bottom, df_arxiv has already been reduced to the
# text/label columns by the label-cleaning cell — confirm that is what
# arxiv_30k.csv is meant to contain.
csv_path = "data/arxiv/arxiv_30k.csv"
df_arxiv.to_csv(csv_path, index=False)

print("Saved arxiv_30k.csv successfully")


Saved arxiv_30k.csv successfully


# Save embeddings

In [None]:
# Persist the stacked embedding array from the embedding loop as a NumPy .npy file.
np.save("data/arxiv/arxiv_30k_embeddings.npy", embeddings)