In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load settings from a .env file into the process environment; the os.getenv
# calls in the configuration cell read them from there.
load_dotenv()

True

In [24]:
# Directory that holds the input files; falls back to "../data" when the
# DATA_PATH environment variable is not set.
DATA_PATH = os.getenv("DATA_PATH", "../data")

# The two settings below have no fallback: os.getenv returns None when the
# variable is missing, and that None is passed on unchanged.
QDRANT_URL = os.getenv("QDRANT_URL_LOCAL")
# Disabled: the embedding model name is hard-coded as `model_handle` in the next cell.
# MODEL_HANDLE = os.getenv("MODEL_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

In [25]:
# Default vector size used by initialize_qdrant_collection when it creates the collection.
# NOTE(review): presumably this must equal the embedding size produced by `model_handle`
# (512 for jina-embeddings-v2-small-en) — confirm against the model card if the model changes.
EMBEDDING_DIMENSIONALITY = 512
# Client built from QDRANT_URL (which is None when QDRANT_URL_LOCAL is unset).
qd_client = QdrantClient(QDRANT_URL)
# Embedding model name handed to models.Document when points are built.
model_handle = 'jinaai/jina-embeddings-v2-small-en'
# Lowercase alias of COLLECTION_NAME; this is the name main() reads.
collection_name = COLLECTION_NAME

In [5]:
def load_documents(data_path=None):
    """Load the FAQ documents from ``docs-with-ids.json``.

    Args:
        data_path: Directory containing ``docs-with-ids.json``. When omitted
            (or ``None``), the module-level ``DATA_PATH`` is used, so existing
            ``load_documents()`` calls behave as before.

    Returns:
        The parsed JSON content of the file, exactly as ``json.load`` returns it.

    Raises:
        FileNotFoundError: If the file does not exist in the chosen directory.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if data_path is None:
        # Resolved at call time so later changes to DATA_PATH are honoured.
        data_path = DATA_PATH

    # Read explicitly as UTF-8: without an explicit encoding, open() uses the
    # platform locale, which can mis-decode non-ASCII text on some systems.
    with open(os.path.join(data_path, 'docs-with-ids.json'), 'r', encoding='utf-8') as f_in:
        return json.load(f_in)

In [6]:
def load_ground_truth(data_path=None):
    """Load the ground-truth rows from ``ground-truth-data.csv``.

    Args:
        data_path: Directory containing ``ground-truth-data.csv``. When omitted
            (or ``None``), the module-level ``DATA_PATH`` is used, so existing
            ``load_ground_truth()`` calls behave as before.

    Returns:
        A list with one dict per CSV row, keyed by column name
        (``DataFrame.to_dict(orient="records")``).
    """
    if data_path is None:
        # Resolved at call time so later changes to DATA_PATH are honoured.
        data_path = DATA_PATH

    csv_path = os.path.join(data_path, 'ground-truth-data.csv')
    return pd.read_csv(csv_path).to_dict(orient="records")

In [7]:
def initialize_qdrant_collection(collection_name, qd_client, embedding_dim: int = EMBEDDING_DIMENSIONALITY):
    """Recreate a Qdrant collection from scratch.

    Calls ``delete_collection`` for ``collection_name`` first and then
    ``create_collection`` with a single cosine-distance vector configuration,
    so whatever was stored under that name before is discarded.

    Args:
        collection_name: Name of the collection to delete and create again.
        qd_client: Client object exposing ``delete_collection`` and
            ``create_collection``.
        embedding_dim: Vector size for the new collection.
    """
    # Drop first; the vector config is built only afterwards, as before.
    qd_client.delete_collection(collection_name=collection_name)

    cosine_vectors = models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE)
    qd_client.create_collection(collection_name=collection_name, vectors_config=cosine_vectors)

    print(f"Qdrant collection '{collection_name}' has been created and initialized.")

In [8]:
def build_qdrant_points(documents, model_handle):
    """Turn documents into Qdrant points ready for upserting.

    Each document's ``question`` and ``text`` fields are joined with a single
    space and wrapped in a ``models.Document`` tagged with ``model_handle``;
    that object is used as the point's vector.

    Args:
        documents: Iterable of dict-like records with ``'question'`` and
            ``'text'`` keys.
        model_handle: Model name passed through to ``models.Document``.

    Returns:
        A list of ``models.PointStruct``, one per document, whose ``id`` is the
        document's position in ``documents`` and whose ``payload`` is the
        document itself.
    """
    return [
        models.PointStruct(
            id=position,
            vector=models.Document(
                text=record['question'] + ' ' + record['text'],
                model=model_handle,
            ),
            payload=record,
        )
        for position, record in enumerate(documents)
    ]

In [9]:
def upsert_documents_to_qdrant(qd_client, collection_name, points):
    """Send all points to a Qdrant collection in one upsert call and report the count.

    Args:
        qd_client: Client object exposing ``upsert``.
        collection_name: Target collection name.
        points: Sized collection of points to upsert.
    """
    upsert_kwargs = {"collection_name": collection_name, "points": points}
    qd_client.upsert(**upsert_kwargs)

    point_count = len(points)
    print(f"Upserted {point_count} documents to Qdrant collection '{collection_name}'.")


In [None]:
def main():
    """Run the indexing pipeline end to end.

    Loads the documents and the ground-truth rows, recreates the collection
    named by the module-level ``collection_name``, builds one point per
    document with ``model_handle`` and upserts them through ``qd_client``,
    printing a progress line after each stage.
    """
    print("Starting Qdrant indexing process...")

    docs = load_documents()
    print(f"Loaded {len(docs)} documents.")

    # The ground-truth rows are only counted here; they are not indexed.
    gt_rows = load_ground_truth()
    print(f"Loaded {len(gt_rows)} ground truth.")

    # Recreates the collection, so anything previously stored in it is dropped.
    initialize_qdrant_collection(collection_name, qd_client)

    qdrant_points = build_qdrant_points(docs, model_handle)
    print(f"Built {len(qdrant_points)} Qdrant points.")

    upsert_documents_to_qdrant(qd_client, collection_name, qdrant_points)

    print("Qdrant indexing process completed successfully.")


In [26]:
# Run the indexing pipeline; its progress messages are printed below.
main()

Starting Qdrant indexing process...
Loaded 98 documents.
Loaded 486 ground truth.
Qdrant collection 'running-faq' has been created and initialized.
Built 98 Qdrant points.
Upserted 98 documents to Qdrant collection 'running-faq'.
Qdrant indexing process completed successfully.
