# Arxiv database embedder
This notebook downloads the Kaggle dataset of arXiv papers, stores the papers in a Qdrant vector database, and attaches to each paper a text embedding of its abstract, computed with the Qwen3-Embedding-0.6B model from Hugging Face through the sentence-transformers library.

In [None]:
!pip install qdrant-client
from qdrant_client import QdrantClient
from google.colab import userdata

apiKey = userdata.get("QUADRANT_KEY")

qdrant_client = QdrantClient(
    url="https://ab493be3-1f1f-4d1b-9f6d-150d5daca89d.sa-east-1-0.aws.cloud.qdrant.io:6333", 
    api_key=apiKey
)

print(qdrant_client.get_collections())

ModuleNotFoundError: No module named 'qdrant_client'

In [None]:
# Get the ArXiv papers dataset from kaggle
!pip install kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

In [None]:
# Load the sentence-embedding model used for the abstracts
from sentence_transformers import SentenceTransformer

modelName = "Qwen/Qwen3-Embedding-0.6B"
model = SentenceTransformer(modelName)

# Smoke test: embed one short sentence and record the vector size, which the
# ingestion cell uses to configure the Qdrant collection.
probeSentence = "Hello, how are you?"
emb = model.encode(probeSentence)
vector_dim = len(emb)
print("Embedding generated successfully:", emb)
print("Embedding dimension:", vector_dim)

In [None]:
import itertools
import json
import os

# Print the first lines of the dataset.
# The snapshot lives in the directory returned by kagglehub (`path`, set in the
# download cell), not in the current working directory.
snapshotFile = os.path.join(path, "arxiv-metadata-oai-snapshot.json")
with open(snapshotFile, "r", encoding="utf-8") as file:
    # islice stops cleanly at end-of-file, whereas readline() would return ""
    # there and make json.loads raise.
    for line in itertools.islice(file, 5):
        data = json.loads(line)
        print(data)

In [None]:
# Create the qdrant collection
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np
import pandas as pd
import json
import os
from tqdm.auto import tqdm

batchSize = 100
collection_name = "arxiv_papers"
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE),
    )
    print(f"Collection '{collection_name}' created successfully")
else:
    print(f"Collection '{collection_name}' already exists")

# Dump the arxiv database into qdrant, adding embeddings from the abstracts.
# Fields to ignore (left out of the stored payload)
fields_to_ignore = {"id", "versions", "authors_parsed"}

# The dataset directory is the value returned by kagglehub.dataset_download in
# the download cell (`path`); `kaggleDSPath` was otherwise never defined.
kaggleDSPath = path
snapshotFile = os.path.join(kaggleDSPath, "arxiv-metadata-oai-snapshot.json")


def upsertBatch(batchEntries):
    """Embed the abstracts of one batch and upsert the resulting points.

    batchEntries is a list of (point id, parsed JSON record) pairs. All
    non-empty abstracts of the batch are encoded in a single model.encode
    call instead of one call per paper; records without an abstract get a
    zero vector of length vector_dim.
    """
    abstracts = [entry.get("abstract", "") for _, entry in batchEntries]
    texts = [text for text in abstracts if text]
    # model.encode on a list returns one row per input text, in input order.
    vectors = iter(model.encode(texts, show_progress_bar=False)) if texts else iter(())

    points = []
    for (pointId, entry), text in zip(batchEntries, abstracts):
        embedding = next(vectors) if text else np.zeros(vector_dim)
        payload = {key: value for key, value in entry.items() if key not in fields_to_ignore}
        points.append(PointStruct(id=pointId, vector=embedding.tolist(), payload=payload))

    qdrant_client.upsert(collection_name=collection_name, points=points)


# Count total lines (optional but gives accurate progress)
with open(snapshotFile, "r", encoding="utf-8") as f:
    totalLines = sum(1 for _ in f)

with open(snapshotFile, "r", encoding="utf-8") as file:
    batchEntries = []
    for idx, line in enumerate(tqdm(file, total=totalLines, desc="Ingesting")):
        # A blank line would make json.loads raise; skipping it keeps the
        # line-number-based ids of all other records unchanged.
        if not line.strip():
            continue
        batchEntries.append((idx, json.loads(line)))

        if len(batchEntries) >= batchSize:
            upsertBatch(batchEntries)
            batchEntries = []

    # final flush
    if batchEntries:
        upsertBatch(batchEntries)

print("Data ingestion completed.")

ModuleNotFoundError: No module named 'qdrant_client'

In [None]:
#Get a recommendation from qdrant
# Embed the free-text query with the same model used for the stored abstracts.
query_abstract = " Deep learning techniques for natural language processing"
query_embedding = model.encode(query_abstract).tolist()
# query_points is the current search entry point of qdrant-client (the older
# `search` method is deprecated). Its response wraps the hits, so `.points`
# yields the list of scored points that `search` used to return directly.
search_results = qdrant_client.query_points(
    collection_name=collection_name,
    query=query_embedding,
    limit=5
).points