# Prepare Qdrant vector database for RAG

## Install Packages

In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.4


In [2]:
!pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.15.1-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.15.1


In [3]:
pip install openai tqdm



## Initialize Qdrant

In [4]:
from qdrant_client import QdrantClient, models

In [5]:
import os

# Read the Qdrant endpoint and API key from the environment so real secrets are
# never saved inside the notebook; 'XXXX' stays as the placeholder fallback.
QDRANT_URL = os.environ.get("QDRANT_URL", 'XXXX')
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", 'XXXX')

In [6]:
# Name of the Qdrant collection that is created below and later receives the upserted points
COLLECTION_NAME = "harry_potter"

In [None]:
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

collection_name = COLLECTION_NAME

# Guard the creation so re-running this cell does not fail when the collection is already there.
if qdrant_client.collection_exists(collection_name):
    print(f"Collection '{collection_name}' already exists; skipping creation.")
else:
    qdrant_client.create_collection(
        collection_name=collection_name,
        # size must equal the dimension requested from the embedding model further down (3072)
        vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE)
    )

    print(f"Collection '{collection_name}' created successfully.")

## Load PDF file and convert into chunks

In [14]:
import pymupdf
import re
import nltk
from typing import List, Dict, Any, Tuple

In [15]:
# Download NLTK's 'punkt' resource (presumably the sentence-tokenizer data needed by nltk.sent_tokenize)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Download NLTK's 'punkt_tab' resource (presumably the tokenizer tables newer NLTK releases look up — TODO confirm)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
def load_pdf_text_with_context(pdf_path: str, book_title: str) -> Tuple[List[Tuple[str, int]], str]:
    """Extract the plain text of every page of a PDF, paired with its page number.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        book_title: Title passed through unchanged so callers can unpack both values.

    Returns:
        A tuple ``(pages, book_title)`` where ``pages`` is a list of
        ``(page_text, page_number)`` tuples; page numbers start at 1.
        If opening or reading fails, the error is printed and the pages
        collected up to that point (possibly none) are returned.
    """
    page_texts_with_context = []
    try:
        # The context manager closes the document even when a page fails to extract,
        # so the file handle is not leaked.
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")
                page_texts_with_context.append((text, page_num))
    except Exception as e:
        # Best effort: report the problem and hand back whatever was read so far.
        print(f"Error loading PDF: {e}")
    return page_texts_with_context, book_title

def clean_text(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed

def create_chunks_with_metadata(
    page_texts_with_context: List[Tuple[str, int]],
    book_title: str,
    chunk_size: int = 1000,
    overlap: int = 200
) -> List[Dict[str, Any]]:
    """Split each page into sentence-aligned, overlapping chunks with metadata.

    Sizes are counted in whitespace-separated words. Pages are processed one at
    a time, so a chunk never spans two pages and the overlap is not carried
    across a page boundary.

    Args:
        page_texts_with_context: ``(page_text, page_number)`` tuples.
        book_title: Stored under the ``"source"`` metadata key of every chunk.
        chunk_size: Word budget; a chunk is closed when the next sentence would exceed it.
        overlap: Word budget for the trailing sentences repeated in the next chunk.

    Returns:
        A list of dicts with a ``"text"`` key and a ``"metadata"`` dict holding
        ``"source"``, ``"page_number"`` and ``"start_sentence"``.
    """

    def count_words(sentence: str) -> int:
        return len(sentence.split())

    def build_record(group: List[str], page_number: int) -> Dict[str, Any]:
        # Join the sentences and attach where the chunk came from
        return {
            "text": " ".join(group),
            "metadata": {
                "source": book_title,
                "page_number": page_number,
                "start_sentence": group[0][:30] + "...",
            }
        }

    def trailing_overlap(group: List[str]) -> Tuple[List[str], int]:
        # Walk backwards, keeping sentences while they fit the overlap budget;
        # the last sentence is always kept, even when it alone exceeds the budget.
        kept = []
        kept_words = 0
        for candidate in group[::-1]:
            size = count_words(candidate)
            fits = kept_words + size <= overlap
            if fits or not kept:
                kept.append(candidate)
                kept_words += size
            if not fits:
                break
        kept.reverse()
        return kept, kept_words

    records = []

    for raw_page, page_number in page_texts_with_context:
        pending = []
        pending_words = 0

        for sentence in nltk.sent_tokenize(clean_text(raw_page)):
            size = count_words(sentence)

            # Close the current chunk before it would grow past the budget
            if pending and pending_words + size > chunk_size:
                records.append(build_record(pending, page_number))
                pending, pending_words = trailing_overlap(pending)

            pending.append(sentence)
            pending_words += size

        # Whatever remains at the end of the page becomes its final chunk
        if pending:
            records.append(build_record(pending, page_number))

    return records

In [18]:
PDF_PATH = 'harry_potter_the_complete_collection.pdf'

BOOK_TITLE = "Harry Potter: The Complete Collection"

# Chunking parameters, both counted in whitespace-separated words
CHUNK_SIZE_WORDS = 1000
CHUNK_OVERLAP_WORDS = 200


page_data, title = load_pdf_text_with_context(PDF_PATH, BOOK_TITLE)

# The loader only prints on failure and returns what it managed to read, so stop
# here instead of silently embedding and upserting nothing.
assert page_data, f"No pages were loaded from {PDF_PATH!r}; check the path before chunking."

chunks_with_meta = create_chunks_with_metadata(
    page_texts_with_context=page_data,
    book_title=title,
    chunk_size=CHUNK_SIZE_WORDS,
    overlap=CHUNK_OVERLAP_WORDS
)


In [21]:
# Spot-check one chunk's text and metadata (needs at least 2263 chunks to exist)
chunks_with_meta[2262]

{'text': 'till-dawn end-of-exams celebration in the common room. Harry barely heard them. He scrambled through the portrait hole while they were still arguing about how many black-market butterbeers they would need and was climbing back out of it, the Invisibility Cloak and Sirius’s knife secure in his bag, before they noticed he had left them. “Harry, d’you want to chip in a couple of Galleons? Harold Dingle reckons he could sell us some firewhisky . . .” But Harry was already tearing away back along the corridor, and a couple of minutes later was jumping the last few stairs to join Ron, Hermione, Ginny, and Luna, who were huddled together at the end of Umbridge’s corridor. “Got it,” he panted. “Ready to go, then?” “All right,” whispered Hermione as a gang of loud sixth years passed them. “So Ron — you go and head Umbridge off. . . . Ginny, Luna, if you can start moving people out of the corridor. . . . Harry and I will get the Cloak on and wait until the coast is clear . . .” Ron str

In [26]:
# Look at the metadata attached to the very first chunk
chunks_with_meta[0]["metadata"]

{'source': 'Harry Potter: The Complete Collection',
 'page_number': 6,
 'start_sentence': 'CONTENTS Harry Potter and the ...'}

## Create chunk embeddings with the OpenAI embedding model

In [29]:
from openai import OpenAI
from typing import List, Dict, Any
from qdrant_client.models import PointStruct
import time

In [23]:
import os

# Read the OpenAI key from the environment so the real secret is never saved in
# the notebook; 'XXXX' stays as the placeholder fallback.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", 'XXXX')

In [24]:
# Vector length requested from the embedding model; the Qdrant collection above is created with the same size (3072)
EMBEDDING_DIMENSION = 3072
# Model name passed to the OpenAI embeddings endpoint
EMBEDDING_MODEL_NAME = "text-embedding-3-large"
# Number of chunks sent in one embeddings request
BATCH_SIZE = 512

In [30]:
def batch_embed_and_create_points(chunks: List[Dict[str, Any]]) -> List[PointStruct]:
    """Embed chunks in batches and wrap each one in a Qdrant ``PointStruct``.

    Uses the module-level ``OPENAI_KEY``, ``EMBEDDING_MODEL_NAME``,
    ``EMBEDDING_DIMENSION`` and ``BATCH_SIZE`` settings.

    Args:
        chunks: Dicts with a ``"text"`` string and a ``"metadata"`` dict.

    Returns:
        One point per successfully embedded chunk. The point id is the chunk's
        position in ``chunks``; the payload is a copy of the chunk metadata plus
        the full text under ``"content"``. A batch whose request fails is
        reported and skipped, so its ids are missing from the result.
    """
    openai_client = OpenAI(api_key=OPENAI_KEY)

    all_qdrant_points = []

    # Ceiling division: an exact multiple of BATCH_SIZE must not report an extra batch.
    total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_number, i in enumerate(range(0, len(chunks), BATCH_SIZE), start=1):
        batch = chunks[i:i + BATCH_SIZE]
        batch_texts = [chunk['text'] for chunk in batch]

        try:
            # Create embeddings for the whole batch in one request
            print(f"Embedding batch {batch_number} of {total_batches}...")
            embedding_response = openai_client.embeddings.create(
                input=batch_texts,
                model=EMBEDDING_MODEL_NAME,
                dimensions=EMBEDDING_DIMENSION
            )

            # Create one PointStruct per chunk
            for j, chunk in enumerate(batch):
                vector = embedding_response.data[j].embedding
                payload = chunk["metadata"].copy()  # copy so the caller's chunk is not mutated
                payload["content"] = chunk["text"]  # Store the full text chunk

                all_qdrant_points.append(
                    PointStruct(
                        id=i + j,  # global position of the chunk across all batches
                        vector=vector,
                        payload=payload,
                    )
                )

        except Exception as e:
            # Best effort: report the failure and move on to the next batch.
            print(f"An error occurred during embedding: {e}")

        # Pause between requests to avoid the rate limit. This also runs after a
        # failed batch (often itself a rate-limit error) and is skipped once the
        # last batch is done, where waiting serves no purpose.
        if batch_number < total_batches:
            print('Sleeping for 5s')
            time.sleep(5)

    return all_qdrant_points

In [31]:
# Embed every chunk and build the Qdrant points (makes network calls to the OpenAI embeddings API)
all_qdrant_points = batch_embed_and_create_points(chunks_with_meta)

Embedding batch 1 of 8...
Sleeping for 5s
Embedding batch 2 of 8...
Sleeping for 5s
Embedding batch 3 of 8...
Sleeping for 5s
Embedding batch 4 of 8...
Sleeping for 5s
Embedding batch 5 of 8...
Sleeping for 5s
Embedding batch 6 of 8...
Sleeping for 5s
Embedding batch 7 of 8...
Sleeping for 5s
Embedding batch 8 of 8...
Sleeping for 5s


## Upsert vectors to Qdrant

In [33]:
# Number of points sent to Qdrant in one upsert call
QDRANT_UPSERT_BATCH_SIZE = 256

In [34]:
def batch_upsert_points(client: QdrantClient, COLLECTION_NAME: str, QDRANT_UPSERT_BATCH_SIZE: int, points: List[PointStruct]):
    """Upsert ``points`` into a Qdrant collection in fixed-size batches.

    Each batch is sent with ``wait=True``. A batch that raises is reported and
    skipped; the remaining batches are still attempted. Progress is printed for
    every batch and nothing is returned.

    Args:
        client: Qdrant client used for the upsert calls.
        COLLECTION_NAME: Target collection.
        QDRANT_UPSERT_BATCH_SIZE: Maximum number of points per upsert call.
        points: Points to upload.
    """
    step = QDRANT_UPSERT_BATCH_SIZE
    point_count = len(points)
    print(f"Total points to upsert: {point_count}. Upserting in batches of {step}.")

    for batch_no, offset in enumerate(range(0, point_count, step), start=1):
        window = points[offset:offset + step]
        print(f"-> Upserting batch {batch_no} ({len(window)} points)...")

        try:
            result = client.upsert(
                collection_name=COLLECTION_NAME,
                wait=True,
                points=window,
            )
            print(f"   Batch {batch_no} upserted. Status: {result.status.name}")
        except Exception as err:
            # Report and carry on with the next batch
            print(f"   Error upserting batch {batch_no}: {err}")

    print("All point batches processed.")

In [35]:
# Upload all embedded points to the collection, QDRANT_UPSERT_BATCH_SIZE points per call
batch_upsert_points(qdrant_client, COLLECTION_NAME, QDRANT_UPSERT_BATCH_SIZE, all_qdrant_points)

Total points to upsert: 3604. Upserting in batches of 256.
-> Upserting batch 1 (256 points)...
   Batch 1 upserted. Status: COMPLETED
-> Upserting batch 2 (256 points)...
   Batch 2 upserted. Status: COMPLETED
-> Upserting batch 3 (256 points)...
   Batch 3 upserted. Status: COMPLETED
-> Upserting batch 4 (256 points)...
   Batch 4 upserted. Status: COMPLETED
-> Upserting batch 5 (256 points)...
   Batch 5 upserted. Status: COMPLETED
-> Upserting batch 6 (256 points)...
   Batch 6 upserted. Status: COMPLETED
-> Upserting batch 7 (256 points)...
   Batch 7 upserted. Status: COMPLETED
-> Upserting batch 8 (256 points)...
   Batch 8 upserted. Status: COMPLETED
-> Upserting batch 9 (256 points)...
   Batch 9 upserted. Status: COMPLETED
-> Upserting batch 10 (256 points)...
   Batch 10 upserted. Status: COMPLETED
-> Upserting batch 11 (256 points)...
   Batch 11 upserted. Status: COMPLETED
-> Upserting batch 12 (256 points)...
   Batch 12 upserted. Status: COMPLETED
-> Upserting batch 13 (

In [40]:
# Spot-check the upload: fetch the point with id 1000 together with its payload
retrieved_points = qdrant_client.retrieve(
    collection_name=COLLECTION_NAME,
    ids=[1000],
    with_payload=True
)

In [41]:
# Display the retrieved record(s) to verify the stored payload
retrieved_points

[Record(id=1000, payload={'source': 'Harry Potter: The Complete Collection', 'page_number': 1014, 'start_sentence': 'then” — Mr. Weasley handed ove...', 'content': 'then” — Mr. Weasley handed over the kettle and a couple of saucepans — “and the rest of us will get some wood for a fire?” “But we’ve got an oven,” said Ron. “Why can’t we just —” “Ron, anti-Muggle security!” said Mr. Weasley, his face shining with anticipation. “When real Muggles camp, they cook on fires outdoors. I’ve seen them at it!” After a quick tour of the girls’ tent, which was slightly smaller than the boys’, though without the smell of cats, Harry, Ron, and Hermione set off across the campsite with the kettle and saucepans. Now, with the sun newly risen and the mist lifting, they could see the city of tents that stretched in every direction. They made their way slowly through the rows, staring eagerly around. It was only just dawning on Harry how many witches and wizards there must be in the world; he had never re