# arXiv Qwen3 Embeddings (Phase 2.2)

This notebook generates embeddings for the arXiv 30k dataset using an open-source model:

Model: Qwen/Qwen3-Embedding-0.6B  
Framework: SentenceTransformers  

This replaces OpenAI embeddings to ensure full reproducibility.

In [4]:
import os
import warnings
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

# Silence library warnings so progress bars and printed output stay readable.
# NOTE: this is a blanket filter; it also hides potentially useful warnings
# (e.g. deprecations), so remove it when debugging.
warnings.filterwarnings("ignore")

# Seed every RNG the notebook's libraries may draw from, not only NumPy:
# the embedding model runs on PyTorch, which keeps its own generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# --- Run configuration ------------------------------------------------------
# Input CSV, given as a path relative to the working directory.
DATA_PATH = "arxiv_30k_clean.csv"

# Directory that receives the embedding matrix and the metadata table.
OUTPUT_DIR = "data/embeddings"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists

# Labels identifying this run: the corpus and the embedding model.
dataset_name = "arxiv"
embedding_model = "Qwen3-Embedding-0.6B"

In [6]:
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# A bare astype(str) would turn missing values into the literal string "nan",
# which would then be embedded as if it were real text. Map missing values to
# "" instead; rows are kept (not dropped) so `texts` stays aligned with `df`.
n_missing = int(df["text"].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} documents have missing text; using empty strings.")

texts = ["" if pd.isna(value) else str(value) for value in df["text"]]

print("Number of documents:", len(texts))
df.head()

Loading dataset...
Number of documents: 30000


Unnamed: 0,text,label
0,GenS: Generalizable Neural Surface Reconstruct...,cs.CV
1,Game-Theoretic and Machine Learning-based Appr...,cs.CR
2,Raman Beam Cleanup in Silicon in the Mid-Infra...,physics.optics
3,Complex event recognition under time constrain...,cs.DB
4,Visual Search at Pinterest We demonstrate th...,cs.CV


In [7]:
# Use a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Build the Hub id from the configured model name so the config cell remains
# the single place where the model is chosen.
model_id = f"Qwen/{embedding_model}"
model = SentenceTransformer(model_id, device=device)

Using device: cpu




model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [8]:
# Encoding the whole corpus is the expensive step of this notebook, so reuse a
# previously saved matrix when one exists and has one row per document.
# NOTE: the check is by row count only; delete the file to force a recompute
# after changing the model or the input texts.
cache_path = os.path.join(OUTPUT_DIR, "arxiv_qwen3_embeddings.npy")

embeddings = None
if os.path.exists(cache_path):
    cached = np.load(cache_path)
    if cached.ndim == 2 and cached.shape[0] == len(texts):
        print("Loaded cached embeddings from", cache_path)
        embeddings = cached
    else:
        print("Cached embeddings do not match the dataset; recomputing.")

if embeddings is None:
    print("Generating embeddings...")
    embeddings = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

print("Embedding shape:", embeddings.shape)

Generating embeddings...


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Resolve both output locations first, then write the two files.
embeddings_file = os.path.join(OUTPUT_DIR, "arxiv_qwen3_embeddings.npy")
metadata_file = os.path.join(OUTPUT_DIR, "arxiv_qwen3_metadata.csv")

# Embedding matrix in NumPy's .npy format.
np.save(embeddings_file, embeddings)

# The loaded DataFrame, written without the pandas index.
df.to_csv(metadata_file, index=False)

print("Saved embeddings and metadata.")