In [1]:
# Notebook-wide settings used by the cells below.
book_title = "Genesi"  # stored in the metadata of every chunk
connection_string = "http://localhost:6333"  # passed to Qdrant as `url`
collection_name = "chap_03"  # Qdrant collection that receives the chunks

In [2]:
from langchain_community.document_loaders import TextLoader

# Load the whole book as a single string.
# The encoding is passed explicitly: when it is omitted the file is decoded
# with the platform default, which can garble accented characters (or raise)
# on systems whose locale is not UTF-8.
# NOTE(review): assumes ./data/genesi.txt is UTF-8 encoded — confirm.
loader = TextLoader("./data/genesi.txt", encoding="utf-8")
content = loader.load()[0].page_content

In [3]:
import re

# Zero-width lookahead: the text is cut right before each "Capitolo <number>"
# occurrence, so the heading stays at the start of its own piece.
chapter_boundary = re.compile(r"(?=Capitolo\s+\d+)")
chapters = chapter_boundary.split(content)

In [4]:
# Separators handed to the text splitter, in this order:
# sentence end (". "), blank line, single newline.
separators = [". ", "\n\n", "\n"]

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Accumulator for the chunk documents, plus the splitter that produces them.
# chunk_overlap (200) is half of chunk_size (400).
docs = list()
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=400,
    chunk_overlap=200,
)

In [6]:
# Start from an empty list every time this cell runs. The loop below only
# appends, so without the reset a second execution of the cell would add a
# duplicate copy of every chunk to `docs`.
docs = []

for chapter_text in chapters:
    chapter_text = chapter_text.strip()
    if not chapter_text:
        # Skip pieces that are empty or whitespace-only after stripping.
        continue

    # Use the leading "Capitolo <number>" heading as the chapter name;
    # pieces that do not start with such a heading are labelled "Unknown".
    match = re.match(r"(Capitolo\s+\d+)", chapter_text)
    chapter_title = match.group(1) if match else "Unknown"

    # Chunk the chapter; the single metadata dict is paired with the single
    # input text, so each resulting chunk carries chapter and book title.
    chapter_docs = splitter.create_documents(
        [chapter_text],
        metadatas=[{"chapter": chapter_title, "book_title": book_title}],
    )
    docs.extend(chapter_docs)

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used to vectorise the chunks: the multilingual E5 (large)
# checkpoint, loaded through the Hugging Face integration.
# NOTE(review): the E5 model card asks for "query: " / "passage: " prefixes on
# the input texts; none are added here — confirm whether that is intended.
embeddings_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
)

In [8]:
from langchain_qdrant import QdrantVectorStore

# Embed every chunk in `docs` and store the vectors in the Qdrant collection.
# force_recreate=True is requested, so an existing collection with the same
# name is expected to be rebuilt rather than appended to.
qdrant = QdrantVectorStore.from_documents(
    docs,
    embedding=embeddings_model,
    collection_name=collection_name,
    url=connection_string,
    distance="Cosine",
    force_recreate=True,
)