# Docling lab
## Summary
This project tests how to use Docling to build a vector store and how to query that store with LangChain.
## Tools
- [Docling](https://docling.ai/)
- [LangChain](https://python.langchain.com/)
- [Chroma](https://www.trychroma.com/)
- [LangSmith](https://www.langchain.com/docs/integrations/langsmith)

In [2]:
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_openai import OpenAI

# LLM handle configured with temperature=0.
# NOTE(review): "gpt-3.5-turbo-0613" is a chat model name, while `OpenAI` is
# the completions-style wrapper -- confirm whether `ChatOpenAI` was intended.
model = OpenAI(temperature=0, model="gpt-3.5-turbo-0613")
# HTTP client for a Chroma server expected at localhost:8000.
client = chromadb.HttpClient(host='localhost', port=8000)
# Embedding function used by the vector store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# LangChain vector store bound to the "llm-rag-ai" collection on that server.
vector_store = Chroma(
    collection_name="llm-rag-ai",
    embedding_function=embeddings,
    client=client
)
# Last expression of the cell, so its return value is shown as the cell
# output; serves as a quick check that the Chroma server is reachable.
client.heartbeat()

1741473230802584755

### Docling test
Loading PDF files (the cell below indexes PDF documents, not CSV data)

In [9]:
from langchain_docling import DoclingLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
import os
from pathlib import Path
import chromadb
from docling import Document, Corpus
# Data location shared by the cells of this notebook.
file_path = "../data/ai-llm"

# Initialize ChromaDB client
chroma_client = chromadb.Client()

# Create or get a collection.
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "pdf_documents" already exists.
collection = chroma_client.get_or_create_collection(name="pdf_documents")

def _split_into_chunks(text, min_chars=10):
    """Split *text* on ". " and keep the stripped pieces longer than *min_chars*."""
    pieces = (piece.strip() for piece in text.split(". "))
    return [piece for piece in pieces if len(piece) > min_chars]


def process_directory(directory_path):
    """
    Load all PDFs from a directory into ChromaDB using docling

    Every ``*.pdf`` directly inside ``directory_path`` is converted with
    docling's ``DocumentConverter`` and exported to Markdown text. The text is
    split on ". " into sentence-like chunks; chunks of 10 characters or fewer
    are dropped. The remaining chunks are written to the module-level
    ``collection`` with a single batched ``add`` call per file.

    Chunk ids are ``"<filename>_chunk_<n>"`` where ``n`` counts only the chunks
    that are actually stored, so ``chunk_id`` and ``total_chunks`` in the
    metadata always describe what is in the collection.

    Args:
        directory_path: Path to directory containing PDF files

    Returns:
        None. Progress and per-file errors are reported with ``print``; a
        failure on one file does not stop the remaining files.
    """
    # Sorted so files are processed in a stable, reproducible order.
    pdf_files = sorted(Path(directory_path).glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return

    print(f"Found {len(pdf_files)} PDF files")

    # Imported lazily so the "no PDFs" path does not pay for loading docling.
    # NOTE(review): the file-level `from docling import Document, Corpus` is no
    # longer used by this function, and docling does not appear to export
    # those names -- confirm and drop that import.
    from docling.document_converter import DocumentConverter

    # One converter for the whole run; it is reused for every file.
    converter = DocumentConverter()

    for pdf_path in pdf_files:
        print(f"Processing {pdf_path.name}...")

        # Deliberately broad: this is the per-file boundary, and one unreadable
        # PDF should be reported without aborting the rest of the batch.
        try:
            document_text = converter.convert(pdf_path).document.export_to_markdown()
            chunks = _split_into_chunks(document_text)

            if not chunks:
                # An empty batch would be rejected by collection.add.
                print(f"No usable text found in {pdf_path.name}; nothing added")
                continue

            base_metadata = {
                "filename": pdf_path.name,
                "path": str(pdf_path),
                "size_bytes": pdf_path.stat().st_size,
            }
            total = len(chunks)

            # Single round trip per document instead of one per chunk.
            collection.add(
                ids=[f"{pdf_path.name}_chunk_{i}" for i in range(total)],
                documents=chunks,
                metadatas=[
                    {**base_metadata, "chunk_id": i, "total_chunks": total}
                    for i in range(total)
                ],
            )

            print(f"Added {total} chunks from {pdf_path.name} to ChromaDB")

        except Exception as e:
            print(f"Error processing {pdf_path.name}: {e}")

if __name__ == "__main__":
    # Placeholder location -- point this at the folder holding the PDFs to index.
    pdf_directory = "path/to/your/pdf/directory"
    process_directory(pdf_directory)

    # Sanity check: run a sample query and ask for up to five results.
    results = collection.query(n_results=5, query_texts=["your search query here"])
    print("\nQuery results:", results)

['85648219-5380-40d6-8180-1e6a3b76e141',
 '541c6d38-5bcf-47eb-b2c3-fafa934284af',
 '4c28478c-f8dd-4c14-8a5f-bd684dcc002b',
 'c1338f95-a695-4f8f-8c36-d655298632dc',
 '5fed526f-8daa-466d-9aa1-c84c3eeea13c',
 'e4f3b8b6-482e-462b-b672-7c7f68de63fd',
 'c8870fab-4082-4314-a6e3-b22437787fbe',
 'f76b1a58-aa6a-4478-aa48-1038a6298475',
 '7a8b006b-04df-43e5-8f23-3453abc5264e',
 'ea284a07-e80e-48d5-85bd-73463a584cf1',
 'ae148fee-cf06-4cf6-85c3-c3ae82f97848',
 '68eedfec-2ce7-4c4a-bd89-6b8b6055a729',
 '49dd65d8-7717-4196-b3b4-40ca3fbdffeb',
 '4eae6cce-0c45-4577-8088-a53bb5e2e328',
 'd9fe91cc-592f-4b32-b015-1480b8a09c3e',
 '4f1f1ad2-f9a1-48be-84d4-0cc3d5ef34f8',
 '16b4798a-04ca-49f3-ad0e-ed57f3c18a03',
 'dacca9d8-841f-47cf-8dd3-a1c587d6f5da',
 'f4e1c7ae-fb39-4267-a95c-86981cfee73a',
 '2fdeebff-b947-4aac-8e9a-5c9ae1edbe97',
 '374cd80e-0711-43f2-819f-d802f64bcd92',
 'd21727f2-f600-48e8-9e6b-e1e2a2425c98',
 'bb0c3084-dc4d-42ff-908a-7c3da6bd359d',
 'ded2c2b9-1339-4b4f-b072-bea07ab47a13',
 '993b30f6-6242-

In [15]:
from langchain_experimental.agents import create_csv_agent
from langchain_openai import ChatOpenAI, OpenAI

# Build an agent that answers questions about the CSV located at `file_path`.
# NOTE: allow_dangerous_code=True lets the agent execute model-generated
# Python; only enable it for trusted data and prompts.
agent = create_csv_agent(
    OpenAI(temperature=0),
    file_path,
    agent_type="zero-shot-react-description",
    # The prefix gives the model its domain role before it sees the question.
    prefix="You are a financial analyst specializing in sales data.",
    allow_dangerous_code=True,
    verbose=True,
)
# The question is in Spanish: "What were the total sales in June 2024?"
response = agent.invoke("cuanto fue el total de ventas en junio 2024?")
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the total sales in June 2024.
Action: python_repl_ast
Action Input: df[df['Fecha'].dt.month == 6 & df['Fecha'].dt.year == 2024]['Total'].sum()[0m[36;1m[1;3mKeyError: 'Fecha'[0m[32;1m[1;3mI need to check the column names in the dataframe.
Action: python_repl_ast
Action Input: df.columns[0m[36;1m[1;3mIndex(['Código producto', 'Nombre producto', 'Referencia fábrica',
       'Grupo inventario', 'Cantidad vendida', 'Valor bruto', 'Descuento',
       'Subtotal', 'Impuesto cargo', 'Impuesto retención', 'Total'],
      dtype='object')[0m[32;1m[1;3mI need to use the correct column name for the date.
Action: python_repl_ast
Action Input: df[df['Fecha'].dt.month == 6 & df['Fecha'].dt.year == 2024]['Total'].sum()[0m[36;1m[1;3mKeyError: 'Fecha'[0m[32;1m[1;3mI need to check the column names again.
Action: python_repl_ast
Action Input: df.columns[0m[36;1m[1;3mIndex(['Código producto', 'Nombre pro