In [1]:
import json
from pathlib import Path
import os
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration read from environment variables and ``../.env``.

    No field declares a default, so every value has to be supplied by the
    environment or the env file. Keys that do not correspond to a field are
    ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Embedding service: the base URL and API key are handed to the OpenAI
    # client in the embedding cell; the model name is passed to the embedder.
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str
    # Not referenced by any cell visible in this notebook.
    embedding_model_dir: str

    # Not referenced by any cell visible in this notebook.
    data_dir: str
    # Exported as HF_HOME right after the settings are loaded.
    docling_model_dir: str


settings = Settings()

# Export the docling model directory as HF_HOME before the project modules are
# imported in the next cell.
# NOTE(review): presumably this makes the Hugging Face libraries behind docling
# cache their models there - confirm.
os.environ["HF_HOME"] = settings.docling_model_dir

In [2]:
import sys
# Make the local `src` directory importable for the `core.*` imports below.
# The membership check keeps re-runs of this cell from piling up duplicate
# entries on sys.path.
# The path is relative, so it resolves against the kernel's working directory.
if "src" not in sys.path:
    sys.path.append("src")

from core.base.schema import TextNode, ImageNode, TableNode, TextType, TextLabel, Document
from core.reader.pdf.docling_reader import DoclingPDFReader
from core.processor.document.text_merger import TextNodeMerger
from core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter
from core.formatter.document.simple import SimpleTextOnlyFormatter

In [3]:
# PDF reader, constructed with its default arguments.
reader = DoclingPDFReader()

In [4]:
# Input PDF. The path is relative, so it resolves against the kernel's
# working directory.
file_path = "resources/1706.03762v7.pdf"

In [5]:
# 1. Run Reader
# Parse the PDF at `file_path`. The returned document exposes `.nodes` and
# `.id_`, both of which later cells rely on.
document = reader.run(file_path)

In [6]:
# 2. Run Processor (merge)
merger = TextNodeMerger()

# Node count as produced by the reader, before merging.
nodes = document.nodes
print(len(nodes))

# `document` is rebound to the merger's result, so re-running this cell feeds
# the already-merged document back into the merger.
document = merger.run(document)

# Node count after merging (echoed as the cell's output).
len(document.nodes)

136


26

In [7]:
# 3. Run Splitter
splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)

# Every chunk records the id of the document it was cut from.
source_id = document.id_

# One chunk (a Document) is built per node, so len(chunks) always equals
# len(document.nodes): the pieces a text node is split into stay grouped
# inside that node's chunk.
chunks = []
for node in document.nodes:
    # Only text nodes go through the splitter; any other node is kept whole.
    # NOTE(review): assumes splitter.run returns a list of nodes - confirm.
    split_nodes = splitter.run(node) if isinstance(node, TextNode) else [node]
    chunk = Document(
        nodes=split_nodes,
        metadata={"source_id": source_id},
    )
    chunks.append(chunk)
print(len(chunks))

26


In [8]:
# 4. Format (Prepare Embedding Input)
# use default templates
formatter = SimpleTextOnlyFormatter()
formatted_texts = formatter.run(chunks)

# Later cells take positions in `formatted_texts` and use them to index into
# `chunks`, so the two sequences must line up one-to-one. Fail here, loudly,
# rather than store embeddings against the wrong chunk.
assert len(formatted_texts) == len(chunks), (
    f"formatter returned {len(formatted_texts)} texts for {len(chunks)} chunks"
)

In [9]:
# Quick inspection of one chunk and one formatted text. A cell only echoes its
# last expression, so the type has to be printed explicitly to be seen.
print(type(chunks[0]))
formatted_texts[2]

''

In [10]:
def select_embedding_input_idxs(texts: list[str], min_length: int = 20) -> list[int]:
    """Return the positions of the texts that are long enough to embed.

    Args:
        texts: Candidate texts.
        min_length: Length threshold. A text is selected only when its
            whitespace-stripped length is strictly greater than this value,
            so a text of exactly ``min_length`` characters is skipped.

    Returns:
        Indices into ``texts``, in ascending order, of the selected texts.
    """
    return [i for i, x in enumerate(texts) if len(x.strip()) > min_length]

# Positions of the formatted texts worth embedding: those longer than 20
# characters once surrounding whitespace is stripped. Empty or near-empty
# texts are left out.
embedding_input_idxs = select_embedding_input_idxs(formatted_texts, min_length=20)
print(len(embedding_input_idxs))

14


In [11]:
# 5. Embed
from openai import OpenAI
from core.embedder.openai.text_embedder import OpenAITextEmbedder

# The embedding-service client gets its own name: a later cell binds `client`
# to the Qdrant client, and sharing one name for both means that re-running
# this cell would silently replace the other object.
openai_client = OpenAI(
    base_url=settings.embedding_base_url,
    api_key=settings.embedding_api_key,
)
embedder = OpenAITextEmbedder(client=openai_client)

# Embed only the texts that passed the length filter, in index order.
# NOTE(review): the store cell pairs embeddings[k] with
# chunks[embedding_input_idxs[k]], which assumes the embedder preserves input
# order - confirm.
embedding_inputs = [formatted_texts[x] for x in embedding_input_idxs]
embeddings = embedder.run(
    texts=embedding_inputs,
    model=settings.embedding_model,
    batch_size=8,
)

In [12]:
# Number of embeddings returned, and the length of the first embedding vector.
print(len(embeddings), len(embeddings[0]))

14 1024


In [13]:
# The store cell pairs embeddings[k] with chunks[embedding_input_idxs[k]], so
# there must be exactly one embedding per selected input.
assert len(embeddings) == len(embedding_input_idxs), (
    f"got {len(embeddings)} embeddings for {len(embedding_input_idxs)} inputs"
)
len(embeddings)

14

In [14]:
# 6. Add to VectorStore
from qdrant_client import QdrantClient
from core.storage.vectorstore.qdrant.single import QdrantSingleVectorStore

# initialize client
# ":memory:" selects qdrant-client's in-process local mode; nothing is
# persisted once the kernel stops.
collection_name = "test"
client = QdrantClient(":memory:")

vector_store = QdrantSingleVectorStore(client=client, collection_name=collection_name)

In [None]:
from qdrant_client.http import models

# The collection's vector size is taken from the first embedding.
embedding_dim = len(embeddings[0])

# Cosine-distance vectors of that size.
# NOTE(review): the client above is created with ":memory:"; confirm whether
# the on-disk flags below have any effect in that mode.
vectors_config = models.VectorParams(
    size=embedding_dim,
    distance=models.Distance.COSINE,
    on_disk=True,
)

vector_store.create(
    on_disk_payload=True,  # store the payload on disk
    vectors_config=vectors_config,
)

In [16]:
# Store only the chunks that were embedded, in the same order as `embeddings`,
# so the two lists pair up position by position.
documents_to_store = [chunks[x] for x in embedding_input_idxs]

# NOTE(review): `metadata_keys` presumably selects which chunk metadata
# entries are copied into the stored payload - confirm against the store.
vector_store.add(
    documents=documents_to_store,
    embeddings=embeddings,
    metadata_keys=["source_id"],
)

In [None]:
# check collection
# Uses the store's private `_client` attribute to query Qdrant directly.
qdrant = vector_store._client
collection_info = qdrant.get_collection(collection_name=vector_store.collection_name)

# Dump the collection description as indented JSON.
print(collection_info.model_dump_json(indent=4))

In [21]:
# check point
# Look up a chunk that was actually written to the store. Only the chunks at
# `embedding_input_idxs` were added, so chunks[0] is present only if index 0
# passed the length filter; otherwise the lookup returns nothing and the next
# cell fails on points[0].
stored_chunk = chunks[embedding_input_idxs[0]]

# Uses the store's private `_client` attribute to query Qdrant directly.
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[stored_chunk.id_],
    with_vectors=True,
)

In [23]:
# Show the retrieved point: its id, its payload, and the length of its vector.
point = points[0]
print(point.id)
print(point.payload)
print(len(point.vector))

d72324bb-06a3-4a08-af47-0e7007e58bc2
{'source_id': 'fdcfcef0-43e3-4d1a-af9a-4e305db0eb08'}
1024
