In [1]:
from langchain_community.document_loaders import TextLoader

# Read the source text file into a list of LangChain documents
genesis_path = "./data/genesi.txt"
loader = TextLoader(genesis_path)
content = loader.load()

In [2]:
# Split points in priority order: blank line (paragraph break) first,
# then a single newline
separators = ["\n\n", "\n"]

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks, breaking on the
# configured separators
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=400,
    chunk_overlap=100,
)
chunks = splitter.split_documents(content)

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser

# Summarization prompt; {doc} is filled with the text of one chunk
prompt_text = "Summarize the following document:\n\n{doc}"
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model served by Ollama, with temperature set to 0
llm = ChatOllama(model="llama3.2", temperature=0)

In [5]:
# Pipeline: take a chunk's text -> fill the prompt -> call the model ->
# parse the reply into a plain string
summarize_chain = (
    {"doc": lambda chunk: chunk.page_content}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Summarize every chunk, allowing at most 10 chain calls in flight at once
summaries = summarize_chain.batch(
    chunks,
    config={"max_concurrency": 10},
)

In [None]:
from langchain_ollama import OllamaEmbeddings

# Embedding model served by Ollama.
# NOTE(review): the original comment said this model is optimized for
# Italian — confirm against the model card before relying on that.
embedding_model_name = "nomic-embed-text"
embeddings_model = OllamaEmbeddings(model=embedding_model_name)

In [None]:
from langchain_qdrant import QdrantVectorStore
connection_string = "http://localhost:6333"

# Embed the chunks and store them in the "chap02" collection of the Qdrant
# server at `connection_string`.
# force_recreate=True drops any existing "chap02" collection first: the
# collection lives on the server and outlives the kernel, so without it every
# re-run of this notebook would append another copy of each chunk and
# similarity searches would return duplicates.
qdrant = QdrantVectorStore.from_documents(
    chunks,
    embedding=embeddings_model,
    url=connection_string,
    collection_name="chap02",
    force_recreate=True,
)

In [None]:
from langchain_classic.storage import InMemoryStore

# Metadata key that links an indexed vector back to its parent document
id_key = "doc_id"

# In-memory key/value store for the parent documents (not persisted)
store = InMemoryStore()

In [None]:
from langchain_classic.retrievers import MultiVectorRetriever

# Retriever that searches `qdrant` and looks documents up in `store`,
# using the `id_key` metadata field as the link between the two
retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=qdrant,
)

In [None]:
import uuid

# One random UUID string per chunk, in chunk order
doc_ids = [str(uuid.uuid4()) for _ in range(len(chunks))]

In [None]:
from langchain_core.documents import Document

# Wrap each summary in a Document tagged with the id at the same position
# in doc_ids
summary_docs = [
    Document(page_content=summaries[idx], metadata={id_key: doc_ids[idx]})
    for idx in range(len(summaries))
]

In [None]:
# Index the summaries in the vector store: each one carries the doc_id of its
# source chunk, which is the link the retriever follows from a vector hit to
# the docstore. Without this step no indexed entry has a doc_id and
# retriever.invoke() returns an empty list.
retriever.vectorstore.add_documents(summary_docs)

# Store the original chunks under the same ids so a summary hit resolves to
# the full chunk text.
retriever.docstore.mset(list(zip(doc_ids, chunks)))

In [None]:
# Query the vector store directly: the 2 nearest entries for the query
retriever.vectorstore.similarity_search(
    query="mangiarono",
    k=2,
)

In [None]:
# Run the same query through the multi-vector retriever
retriever.invoke(input="mangiarono")