In [None]:
import re
from pypdf import PdfReader
from chromadb import PersistentClient

In [None]:
# Extract the raw text of the first product sheet and normalise its whitespace.
with PdfReader('../data/test-producto-1.pdf') as reader:
    print(f'Reading document with {len(reader.pages)} pages')
    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next; the newline is collapsed to a space below.
    text = '\n'.join(page.extract_text() for page in reader.pages)
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
product_1_text = re.sub(r'\s+', ' ', text)

In [None]:
# Extract the raw text of the second product sheet and normalise its whitespace.
with PdfReader('../data/test-producto-2.pdf') as reader:
    print(f'Reading document with {len(reader.pages)} pages')
    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next; the newline is collapsed to a space below.
    text = '\n'.join(page.extract_text() for page in reader.pages)
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
product_2_text = re.sub(r'\s+', ' ', text)

In [None]:
# Character counts of the two normalised product texts, shown as a tuple.
tuple(map(len, (product_1_text, product_2_text)))

In [None]:
# Prototype of the sliding-window split, applied to the first product only:
# 500-character windows whose start positions advance by 500 - 30 characters.
chunk_size, overlap = 500, 30
chunks = [
    product_1_text[start:start + chunk_size]
    for start in range(0, len(product_1_text), chunk_size - overlap)
]

# Show the first two windows to eyeball the overlap between them.
chunks[0], chunks[1]

In [None]:
def text_splitter(text: str, chunk_size: int = 500, overlap: int = 30) -> list[str]:
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order of appearance; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the window never advance (an
        # endless loop); a negative overlap would silently skip text.
        raise ValueError('overlap must satisfy 0 <= overlap < chunk_size')

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
# Flatten the chunks of both products into one list, product 1 first.
documents = [
    chunk
    for product_text in (product_1_text, product_2_text)
    for chunk in text_splitter(product_text)
]

print(f'Number of documents: {len(documents)}')


In [None]:
# Open the on-disk Chroma database stored next to the notebooks folder.
client = PersistentClient(path='../chroma')

In [None]:
# Reuse the 'products' collection if it exists; otherwise create it.
collection = client.get_or_create_collection(name='products')

In [None]:
import uuid

# Derive each id from the chunk's position and content instead of uuid4():
# re-running the notebook then regenerates the same ids rather than storing
# the same chunks again under fresh random ids in the persistent collection.
# NOTE(review): this relies on Chroma's documented behaviour that add() ignores
# ids that already exist - confirm for the installed chromadb version.
# The position prefix keeps ids unique even when two chunks have equal text.
ids = [
    str(uuid.uuid5(uuid.NAMESPACE_OID, f'{index}:{document}'))
    for index, document in enumerate(documents)
]
ids[:2]

In [None]:
# Store every chunk under its id; ids and documents are matched by position.
collection.add(documents=documents, ids=ids)

In [None]:
# Ask for the three stored chunks closest to a sample question.
question = 'Qué alérgenos contienen las patatas fritas?'
results = collection.query(query_texts=[question], n_results=3)
results

In [None]:
# 'documents' holds one list of hits per query text; only one query was sent.
top_documents = results.get('documents')[0]
for document in top_documents:
    # A blank line after each hit keeps the printed chunks visually separate.
    print(document, end='\n\n')

In [None]:
from queue import Queue
from contextlib import contextmanager

In [None]:
class ChromaDbClient:
    """Fixed-size pool of Chroma ``PersistentClient`` objects for one database path.

    The pool is a ``queue.Queue``: ``acquire`` takes a client out, yields a
    collection obtained from it, and puts the client back afterwards. The
    instance is also a context manager whose exit calls ``close``.
    """

    def __init__(self, db_path: str = './chroma', pool_size: int = 10):
        """Create up to ``pool_size`` clients for ``db_path`` and queue them.

        Args:
            db_path: Path passed to every ``PersistentClient``.
            pool_size: Number of clients to attempt to create.

        Raises:
            RuntimeError: If no client could be created at all.
        """
        self.pool = Queue(maxsize=pool_size)
        successful_connections = 0
        for i in range(pool_size):
            try:
                client = PersistentClient(db_path)
                self.pool.put(client)
                successful_connections += 1
            except Exception as e:
                # Best effort: report the failure and keep trying, so a
                # partially filled pool is still usable.
                print(f'Error while creating connection {i+1}: {str(e)}')

        if self.pool.empty():
            raise RuntimeError('Unable to establish connections with the database')

        print(f'{successful_connections}/{pool_size} connections established')

    def close(self):
        """Remove every client currently in the pool.

        Only the queue is emptied; no method is called on the clients.
        """
        while not self.pool.empty():
            self.pool.get_nowait()

    def __enter__(self):
        """Return the pool itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Empty the pool on exit; exceptions from the body are not suppressed."""
        self.close()

    @contextmanager
    def acquire(self, collection_name: str, timeout: float = 30):
        """Borrow a client and yield the named collection from it.

        Args:
            collection_name: Collection to get, or create if it is missing.
            timeout: Seconds to wait for a free client.

        Yields:
            The result of ``client.get_or_create_collection(collection_name)``.

        Raises:
            RuntimeError: If no client becomes available within ``timeout``.
        """
        # Imported here because the notebook's import cell only brings in
        # ``Queue``; without this, a timeout surfaced as a NameError.
        from queue import Empty

        try:
            client = self.pool.get(timeout=timeout)
        except Empty:
            raise RuntimeError('No available connections in pool') from None
        try:
            yield client.get_or_create_collection(collection_name)
        finally:
            # Always hand the client back, even if the caller's block raised.
            self.pool.put(client)


In [None]:
# Repeat the sample query, this time through the pooled wrapper.
client = ChromaDbClient(db_path='../chroma')
question = 'Qué alérgenos contienen las patatas fritas?'
with client.acquire(collection_name='products') as collection:
    results = collection.query(query_texts=[question], n_results=3)
results