# Arxiv database embedder
This notebook downloads the Kaggle dataset of arXiv papers, stores the paper metadata in a Qdrant vector database, and attaches to each record a text embedding of its abstract, computed with the Qwen3-Embedding-8B model loaded through the sentence-transformers library.

In [None]:
!pip install qdrant-client
from qdrant_client import QdrantClient
from google.colab import userdata

apiKey = userdata.get("QUADRANT_KEY")

qdrant_client = QdrantClient(
    url="https://ab493be3-1f1f-4d1b-9f6d-150d5daca89d.sa-east-1-0.aws.cloud.qdrant.io:6333", 
    api_key=apiKey
)

print(qdrant_client.get_collections())

ModuleNotFoundError: No module named 'qdrant_client'

In [None]:
# Get the ArXiv papers dataset from kaggle
!pip install kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

In [None]:
# Load the embedding model used for the paper abstracts.
from sentence_transformers import SentenceTransformer

model_id = "Qwen/Qwen3-Embedding-8B"
model = SentenceTransformer(model_id)

# Smoke test: embed one short sentence and report the resulting vector and its
# length, so a broken model load is noticed before any real data is processed.
probe_text = "Hello, how are you?"
emb = model.encode(probe_text)
print("Embedding generated successfully:", emb)
print("Embedding dimension:", len(emb))

In [None]:
# Create the Qdrant collection (if it does not exist yet) and load a sample of
# arXiv papers into it, each stored with an embedding of its abstract.
# Relies on `qdrant_client` (the connected client) and `model` (the
# SentenceTransformer) defined in earlier cells.
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np
import pandas as pd

COLLECTION_NAME = "test_arxiv_papers"

# Metadata columns copied into each point's payload.
PAYLOAD_COLUMNS = [
    "id",
    "submitter",
    "title",
    "journal_ref",
    "doi",
    "report-no",
    "categories",
    "license",
    "abstract",
    "update_date",
]

# The collection's vector size has to match the length of the vectors the
# model produces, so it is queried from the model instead of being hard-coded.
embedding_dim = model.get_sentence_embedding_dimension()

if not qdrant_client.collection_exists(COLLECTION_NAME):
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )
    print(f"Collection '{COLLECTION_NAME}' created successfully.")
else:
    # NOTE(review): a collection left over from an earlier run may have been
    # created with a different vector size; if so it must be deleted and
    # recreated before the upsert below can succeed.
    print(f"Collection '{COLLECTION_NAME}' already exists; reusing it.")

# Dump the arXiv metadata into Qdrant, adding embeddings of the abstracts.
# NOTE(review): this path is hard-coded and is not derived from the `path`
# returned by kagglehub.dataset_download(...) in the download cell. Confirm
# that the file really exists at this location and that it is a CSV.
df = pd.read_csv('/content/arxiv/arxiv-metadata-oai-snapshot.csv', nrows=10)  # Load a sample of 10 rows for demonstration

points = []
for index, row in df.iterrows():
    vector = model.encode(row["abstract"]).tolist()
    # Empty CSV fields are read as NaN, which is not valid JSON; store them as
    # null in the payload instead.
    payload = {
        column: (None if pd.isna(row[column]) else row[column])
        for column in PAYLOAD_COLUMNS
    }
    # int(...) turns a possibly NumPy-typed index label into a plain Python int.
    points.append(PointStruct(id=int(index), vector=vector, payload=payload))

# Send all points in a single request instead of one request per row.
if points:
    qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points)
print(f"Upserted {len(points)} papers into '{COLLECTION_NAME}'.")

ModuleNotFoundError: No module named 'qdrant_client'