In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.documents import Document
from langchain_chroma import Chroma

import uuid
from langchain_community.document_loaders import PyPDFLoader

In [2]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=510, 
    chunk_overlap=50,
    length_function=len,
    separators=["\n", ".", "\n\n"]
)

In [3]:
# Probar distintos modelos de embeddings
embeddings_generator = OllamaEmbeddings(
    model='mxbai-embed-large:latest',
)

In [18]:
chroma_db_path = "./database/chroma_db"

vector_store = Chroma(
    persist_directory=chroma_db_path,
    embedding_function=embeddings_generator,
    collection_name="kanji"
)

In [None]:
# Documentos txt
txt_path = './database/docs/txt/kanji-1.txt'
txt_text = ''

with open(txt_path, 'r', encoding='utf-8') as file:
    txt_text = file.read()

print(len(txt_text))

4790


In [24]:
text_chunks = text_splitter.split_text(txt_text)
print(len(text_chunks))

12


In [25]:
for chunk in text_chunks:
    document = Document(
        id=str(uuid.uuid4()),
        page_content=chunk,
        metadata={
            'name': 'kanji',
            'source': 'https://es.wikipedia.org/wiki/Kanji',
            'generation': 6,
            'image_src': 'https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/Kanji_furigana.svg/163px-Kanji_furigana.svg.png'
        }
    )

    vector_store.add_documents([document])

In [28]:
vector_store.similarity_search("Que es un kanji?", k=5)

ValueError: Chroma collection not initialized. Use `reset_collection` to re-create and initialize the collection. 

In [None]:
pdf_document = PyPDFLoader('./database/docs/PDF/KanjiParaRecordar.pdf')
pdf_pages = pdf_document.load()

In [None]:
pdf_chunks = text_splitter.split_documents(pdf_pages)
print(len(pdf_chunks))

In [None]:
for chunk in pdf_chunks:
    chunk.id = str(uuid.uuid4())
    chunk.metadata = {
        "name": "definicion"
    }

In [None]:
vector_store.add_documents(pdf_chunks)

In [None]:
vector_store.similarity_search("Que es la megaevolucion?", k=5)

In [27]:

# Para eliminar la colección
vector_store.delete_collection()
