In [None]:
# pip install required modules

In [None]:
# imports
import getpass
import os

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
# from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains.llm import LLMChain
# LangChain I/O
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# LCEL
from langchain.schema.runnable import RunnablePassthrough
# Summarize document
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
import textwrap

# for langchain indexing API
from langchain.indexes import SQLRecordManager, index

Let's add Documents and Embeddings!

In [None]:
# Load the first source file with TextLoader and report how many Document
# objects it returned.
file_path = './bella_vista.txt'
loader = TextLoader(file_path)
documents = loader.load()
print(f"Loaded {len(documents)} documents from the folder.")

# Split the loaded text into small, slightly overlapping chunks.
text_splitter = CharacterTextSplitter(chunk_size=150, chunk_overlap=20)
docs = text_splitter.split_documents(documents)
print(len(docs))

# The indexing cells further down pass a single Document named `doc`, but no
# cell ever assigned it, so a fresh "Restart & Run All" stopped with NameError.
# NOTE(review): using the first chunk as the demo document is an assumption —
# confirm this is the intended `doc`.
if not docs:
    raise ValueError(f"No chunks were produced from {file_path!r}")
doc = docs[0]

# create embeddings model
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [None]:
# Set up the Chroma vector store. It embeds with the `embeddings` model created
# in the previous cell (GoogleGenerativeAIEmbeddings) and persists to ./chroma_db.
collection_name = "test_index"
vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# Record manager used by `index(...)` to track document writes. It is stored in
# a local SQLite file and namespaced by the collection name.
namespace = "chroma/" + collection_name
record_manager = SQLRecordManager(
    namespace,
    db_url="sqlite:///record_manager_cache.sql",
)
record_manager.create_schema()

In [None]:
# # Helper function to clear content (used for setup)
# def _clear():
#     index([], record_manager, vectorstore, cleanup="full", source_id_key="source")

# # Clear the vector store and record manager (setup for a clean state)
# _clear()

In [None]:
# cleanup=None: index without any automatic cleanup of older content.
# The same `doc` object is submitted five times in one batch; the notebook's
# point is that only one unique document should end up being added.
index([doc] * 5, record_manager, vectorstore, cleanup=None, source_id_key="source")

In [None]:
# Load the second source file and split it with the same splitter that was
# used for the first file.
file_path2 = './bella_vista_2.txt'
loader2 = TextLoader(file_path2)
# Fix: this previously called `loader.load()`, i.e. the loader for the FIRST
# file, so `documents2` was just a second copy of `documents` and
# bella_vista_2.txt was never read.
documents2 = loader2.load()
print(f"Loaded {len(documents2)} documents from the folder.")

docs2 = text_splitter.split_documents(documents2)
print(len(docs2))

# The indexing cells below pass a single Document named `doc2`, which no cell
# ever assigned (NameError on a fresh kernel).
# NOTE(review): using the first chunk as the demo document is an assumption —
# confirm this is the intended `doc2`.
if not docs2:
    raise ValueError(f"No chunks were produced from {file_path2!r}")
doc2 = docs2[0]

In [None]:
# Submit the already-indexed `doc` together with the new `doc2`, still with
# cleanup=None. Expectation stated by the notebook: `doc` is skipped because it
# was indexed before, and only `doc2` is added.
index(
    [doc, doc2],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

In [None]:
# Repeat the exact same call as the previous cell. Expectation stated by the
# notebook: both documents are already indexed, so nothing is added or updated.
index(
    [doc, doc2],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

In [None]:
# cleanup="incremental": same two documents, but now with the mode that
# automatically cleans up old versions of content, keyed by the "source"
# metadata field.
# NOTE(review): when the earlier cells have already indexed `doc` and `doc2`,
# this call is expected to skip them rather than add them again — check the
# returned counts.
index([doc, doc2], record_manager, vectorstore, cleanup="incremental", source_id_key="source")

In [None]:
# cleanup="full" with an empty batch. This is the same call the commented-out
# `_clear()` helper earlier in the notebook makes to reset to a clean state.
# NOTE(review): since no documents are supplied, this is expected to remove the
# previously indexed content and add nothing — check the returned counts.
index([], record_manager, vectorstore, cleanup="full", source_id_key="source")