In [5]:
import getpass
import os
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import models

from dotenv import load_dotenv


In [7]:

# Load environment variables from the .env file one directory above the
# notebook; later cells read the QDRANT_* settings through os.getenv.
ENV_FILE = './../.env'
load_dotenv(ENV_FILE)

# Embedding client handed to the vector store further down.
EMBEDDING_MODEL = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [8]:
def extract_simple_metadata(complex_metadata):
    """
    Flatten the nested Docling metadata into a small dict of scalar values.

    Only the fields below are carried over, and each one only when present:
    ``source``, ``filename`` and ``mimetype`` (from ``dl_meta['origin']``),
    ``headings`` (joined with ", "), ``pages`` (sorted unique page numbers
    joined with ", ") and ``page_count`` (number of unique pages).

    Args:
        complex_metadata (dict): Original metadata structure.

    Returns:
        dict: Simplified metadata structure.
    """
    flat = {}

    # Keep the document source when it is available.
    if 'source' in complex_metadata:
        flat['source'] = complex_metadata['source']

    # Everything else lives under dl_meta; without it there is nothing to add.
    if 'dl_meta' not in complex_metadata:
        return flat

    meta = complex_metadata['dl_meta']

    # File name and MIME type both come from the "origin" sub-dict.
    if 'origin' in meta:
        origin = meta['origin']
        for field in ('filename', 'mimetype'):
            if field in origin:
                flat[field] = origin[field]

    # Headings are stored as a list; an empty list is skipped entirely.
    headings = meta['headings'] if 'headings' in meta else None
    if headings:
        flat['headings'] = ', '.join(headings)

    # Gather the distinct page numbers referenced by the chunk's items.
    if 'doc_items' in meta:
        page_numbers = {
            prov['page_no']
            for item in meta['doc_items']
            if 'prov' in item
            for prov in item['prov']
            if 'page_no' in prov
        }

        if page_numbers:
            flat['pages'] = ', '.join(str(number) for number in sorted(page_numbers))
            flat['page_count'] = len(page_numbers)

    return flat

def process_documents_metadata(documents):
    """
    Build a new list of documents whose metadata has been simplified.

    Each input document is deep-copied and the copy's ``metadata`` is replaced
    with the result of ``extract_simple_metadata``; the original documents are
    left unmodified.

    Args:
        documents (list): List of LangChain documents.

    Returns:
        list: New list of documents with processed metadata.
    """
    from copy import deepcopy

    def _with_simple_metadata(document):
        # Work on a copy so the caller's document keeps its full metadata.
        clone = deepcopy(document)
        clone.metadata = extract_simple_metadata(document.metadata)
        return clone

    return [_with_simple_metadata(document) for document in documents]

In [9]:
import os

# Directory scanned for input files, and the Docling export mode
# (DOC_CHUNKS yields one document per chunk rather than one per file).
UNPROCESSED_DATA_DIR = "./../data/unprocessed"
EXPORT_TYPE = ExportType.DOC_CHUNKS

# Collect every regular file in the directory (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the paths are sorted to
# keep the chunk order and the printed output stable across runs and machines.
file_paths = sorted(
    candidate
    for candidate in (
        os.path.join(UNPROCESSED_DATA_DIR, entry)
        for entry in os.listdir(UNPROCESSED_DATA_DIR)
    )
    if os.path.isfile(candidate)
)

# Load documents using DoclingLoader
loader = DoclingLoader(
    file_path=file_paths,
    export_type=EXPORT_TYPE
)

docs = loader.load()

# Flatten the nested Docling metadata, then print each chunk for inspection.
processed_documents = process_documents_metadata(docs)
for doc in processed_documents:
    print(doc.metadata)
    print("-"*100)
    print(doc.page_content)
    print("-"*100)
    


Token indices sequence length is longer than the specified maximum sequence length for this model (3018 > 512). Running this sequence through the model will result in indexing errors


{'source': './../data/unprocessed/waukesha.pdf', 'filename': 'waukesha.pdf', 'mimetype': 'application/pdf', 'pages': '1', 'page_count': 1}
----------------------------------------------------------------------------------------------------
Visit our internet site, waukeshaengine.dresser.com to locate the Waukesha Engine distributor nearest you.
Serial No.
Specification No.
----------------------------------------------------------------------------------------------------
{'source': './../data/unprocessed/waukesha.pdf', 'filename': 'waukesha.pdf', 'mimetype': 'application/pdf', 'headings': '/Ge2', 'pages': '1', 'page_count': 1}
----------------------------------------------------------------------------------------------------
/Ge2
Available to order in book format.
----------------------------------------------------------------------------------------------------
{'source': './../data/unprocessed/waukesha.pdf', 'filename': 'waukesha.pdf', 'mimetype': 'application/pdf', 'headings': 'F

In [None]:
# Connect to an existing Qdrant collection; the collection name, URL and API
# key are read from the environment (loaded from .env earlier in the notebook).
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    collection_name=os.getenv("QDRANT_COLLECTION_NAME"),
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Ingestion step, left disabled: uncomment to add the processed chunks to the
# collection. It has to run after vector_store is created.
# vector_store.add_documents(processed_documents)

# Restrict the search to chunks whose payload field metadata.filename equals
# the given file. The filter is built from qdrant_client models rather than a
# raw dict, matching the type QdrantVectorStore.similarity_search declares.
filename_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.filename",
            match=models.MatchValue(value="caterpillar.pdf"),
        )
    ]
)

results = vector_store.similarity_search(
    "How to clean and maintain the air filter?",
    k=5,
    filter=filename_filter,
)

# Five hits are requested above, but only the first three are printed.
for result in results[:3]:
    print(result.page_content)
    print("-"*100)
    print(result.metadata)
    print("-"*100)
    print("-"*100)